From 4c162b9d358f3ad9e0cfde72838412990f55f8a8 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sat, 19 Mar 2022 10:14:31 -0700 Subject: [PATCH 01/64] Mobilenet improvements (#209) Offloads depthwise convolution scheduling onto the loop-unroller FSM. This improves MobileNet performance by 2-3x. Also adds more printfs to `mobilenet.c` to show the performance of each individual layer. --- SPIKE.hash | 2 +- software/gemmini-rocc-tests | 2 +- src/main/scala/gemmini/Controller.scala | 2 +- src/main/scala/gemmini/GemminiConfigs.scala | 1 + src/main/scala/gemmini/GemminiISA.scala | 2 +- src/main/scala/gemmini/LoopConv.scala | 13 ++++++++++--- 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/SPIKE.hash b/SPIKE.hash index f08ac921..e1be3611 100644 --- a/SPIKE.hash +++ b/SPIKE.hash @@ -1 +1 @@ -090e82c473fd28b4eb2011ffcd771ead6076faab +bd19c16c779d419c322790b81a5ac57e47773c1c diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index e326e7c4..6ad94746 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit e326e7c43457ff08669fe88edcaa395d846474d8 +Subproject commit 6ad94746cb72ba6824b101dc6a46d5e997cd17b3 diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index 3ff10955..cf572c0c 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -150,7 +150,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] new ConfigExRs1(acc_scale_t_bits), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), - has_training_convs, has_max_pool, has_first_layer_optimizations) } + has_training_convs, has_max_pool, has_first_layer_optimizations, has_dw_convs) } val (loop_cmd, loop_matmul_unroller_busy) = withClock (gated_clock) 
{ LoopMatmul(conv_cmd, reservation_station.io.matmul_ld_completed, reservation_station.io.matmul_st_completed, reservation_station.io.matmul_ex_completed, meshRows*tileRows, coreMaxAddrBits, reservation_station_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala index 97e068c8..40c1c777 100644 --- a/src/main/scala/gemmini/GemminiConfigs.scala +++ b/src/main/scala/gemmini/GemminiConfigs.scala @@ -85,6 +85,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( has_training_convs: Boolean = true, has_max_pool: Boolean = true, has_nonlinear_activations: Boolean = true, + has_dw_convs: Boolean = true, has_first_layer_optimizations: Boolean = true, diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala index 9cb15ac9..ea3aed12 100644 --- a/src/main/scala/gemmini/GemminiISA.scala +++ b/src/main/scala/gemmini/GemminiISA.scala @@ -24,7 +24,7 @@ object GemminiISA { val LOAD3_CMD = 14.U // TODO add orows and ocols to this as well - val LOOP_CONV_WS = 15.U // no_bias, wrot180, trans_output_1203, trans_weight_1203, trans_input_3120, max_pixels_per_row | no_pool, downsample, input_dilated, act + val LOOP_CONV_WS = 15.U // no_bias, wrot180, trans_output_1203, trans_weight_1203, trans_input_3120, dw, max_pixels_per_row | no_pool, downsample, input_dilated, act val LOOP_CONV_WS_CONFIG_1 = 16.U // batch_size, in_dim, in_channels, out_channels | out_dim, pool_out_dim, stride, padding val LOOP_CONV_WS_CONFIG_2 = 17.U // kernel_dim, pool_size, pool_stride, pool_padding | batches, porows, pocols, pochs val LOOP_CONV_WS_CONFIG_3 = 18.U // krows, kcols, kchs, lpad | rpad, upad, dpad, plpad diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index 16609f5a..3d7a099e 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ 
b/src/main/scala/gemmini/LoopConv.scala @@ -397,6 +397,7 @@ class LoopConvLdWeightReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: val dram_addr = UInt(coreMaxAddrBits.W) val trans_weight_1203 = Bool() val trans_weight_0132 = Bool() + val dw = Bool() val loop_id = UInt(log2Up(concurrent_loops).W) } @@ -439,6 +440,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit val addr_start = req.addr_end - B_rows val dram_stride = MuxCase(out_channels, Seq( + req.dw -> 1.U, req.trans_weight_1203 -> (kernel_dim * kernel_dim * out_channels), req.trans_weight_0132 -> in_channels )) * (input_w/8).U @@ -451,6 +453,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit // Addresses val dram_offset = MuxCase(((krow*kernel_dim*in_channels +& kcol*in_channels +& kch) * out_channels +& och) * (input_w/8).U, Seq( + req.dw -> (krow * kernel_dim +& kcol) * (input_w/8).U, req.trans_weight_1203 -> (((kch*kernel_dim*kernel_dim +& krow*kernel_dim +& kcol) * out_channels +& och) * (input_w/8).U), req.trans_weight_0132 -> (((krow*kernel_dim*out_channels +& kcol*out_channels +& och) * in_channels +& kch) * (input_w/8).U) )) @@ -1059,6 +1062,7 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s val trans_weight_1203 = Bool() val trans_weight_0132 = Bool() val trans_input_3120 = Bool() + val dw = Bool() val max_pixels_per_row = UInt(small_iterator_bitwidth.W) @@ -1150,7 +1154,8 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2, config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, - has_training_convs: Boolean, has_max_pool: Boolean, has_first_layer_optimizations: Boolean) + has_training_convs: Boolean, has_max_pool: Boolean, has_first_layer_optimizations: Boolean, + 
has_dw_convs: Boolean) (implicit p: Parameters) extends Module { val large_iterator_bitwidth = 16 val small_iterator_bitwidth = 16 // 8 @@ -1330,6 +1335,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: loop_being_configured.trans_weight_1203 := has_training_convs.B && cmd.bits.cmd.rs1(3) loop_being_configured.trans_weight_0132 := has_training_convs.B && cmd.bits.cmd.rs1(4) loop_being_configured.trans_input_3120 := has_training_convs.B && cmd.bits.cmd.rs1(5) + loop_being_configured.dw := has_dw_convs.B && cmd.bits.cmd.rs1(6) loop_being_configured.no_pool := !has_max_pool.B || cmd.bits.cmd.rs2(0) loop_being_configured.activation := cmd.bits.cmd.rs2(4,3) @@ -1400,6 +1406,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: ld_weights.io.req.bits.dram_addr := loop_requesting_ld_weights.weights_dram_addr ld_weights.io.req.bits.trans_weight_1203 := loop_requesting_ld_weights.trans_weight_1203 ld_weights.io.req.bits.trans_weight_0132 := loop_requesting_ld_weights.trans_weight_0132 + ld_weights.io.req.bits.dw := loop_requesting_ld_weights.dw ld_weights.io.req.bits.loop_id := loop_requesting_ld_weights_id ld_weights.io.req.valid := !loop_requesting_ld_weights.ld_weights_started && loop_requesting_ld_weights.configured @@ -1503,13 +1510,13 @@ object LoopConv { config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2, config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs, compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, has_training_convs: Boolean, has_max_pool: Boolean, - has_first_layer_optimizations: Boolean) + has_first_layer_optimizations: Boolean, has_dw_convs: Boolean) (implicit p: Parameters): (DecoupledIO[GemminiCmd], Bool) = { val mod = Module(new LoopConv(block_size, coreMaxAddrBits, rob_size, max_lds, max_exs, max_sts, max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes, config_mvin_rs1_t, mvin_rs2_t, 
config_mvout_rs2_t, mvout_rs2_t, config_ex_rs1_t, preload_rs1_t, preload_rs2_t, - compute_rs1_t, compute_rs2_t, has_training_convs, has_max_pool, has_first_layer_optimizations)) + compute_rs1_t, compute_rs2_t, has_training_convs, has_max_pool, has_first_layer_optimizations, has_dw_convs)) mod.io.in <> in mod.io.ld_completed := ld_completed From b9ccd8405d48ec9198921b9086740e2aa2e36192 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Wed, 4 May 2022 14:38:58 -0700 Subject: [PATCH 02/64] Bump ort to 2021-12-23 (#218) Fixes ucb-bar/onnxruntime-riscv#76 --- software/onnxruntime-riscv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/onnxruntime-riscv b/software/onnxruntime-riscv index 7bbd0496..0c8c9b4f 160000 --- a/software/onnxruntime-riscv +++ b/software/onnxruntime-riscv @@ -1 +1 @@ -Subproject commit 7bbd0496b579863c6906c0449932ac5ddc4c5357 +Subproject commit 0c8c9b4f881b5f31d32c6b5a76cac4ee14a8f338 From 073e073ca8552ab5e23ec7bc31f65aeedccbcf34 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Tue, 23 Aug 2022 09:45:09 -0700 Subject: [PATCH 03/64] Add support for I-BERT's layernorm, softmax, and gelu ops (#237) --- SPIKE.hash | 2 +- scripts/build-onnx-inference.sh | 2 +- scripts/build-vcs.sh | 9 +- scripts/build-verilator.sh | 7 +- software/gemmini-ort.json | 2 +- software/gemmini-rocc-tests | 2 +- software/gemmini-tests-interactive.json | 3 +- software/gemmini-tests.json | 3 +- src/main/scala/gemmini/AccumulatorMem.scala | 46 +- src/main/scala/gemmini/AccumulatorScale.scala | 193 ++++-- src/main/scala/gemmini/Activation.scala | 6 +- src/main/scala/gemmini/Arithmetic.scala | 276 ++++++-- src/main/scala/gemmini/Configs.scala | 2 +- src/main/scala/gemmini/Controller.scala | 1 - src/main/scala/gemmini/CustomConfigs.scala | 8 + .../scala/gemmini/DMACommandTracker.scala | 1 - .../scala/gemmini/ExecuteController.scala | 20 +- src/main/scala/gemmini/GemminiConfigs.scala | 7 +- src/main/scala/gemmini/GemminiISA.scala | 36 +- 
src/main/scala/gemmini/LocalAddr.scala | 9 +- src/main/scala/gemmini/LoopConv.scala | 4 +- src/main/scala/gemmini/LoopMatmul.scala | 110 ++- src/main/scala/gemmini/NormCmd.scala | 23 + src/main/scala/gemmini/Normalizer.scala | 635 ++++++++++++++++++ .../scala/gemmini/ReservationStation.scala | 10 +- src/main/scala/gemmini/Scratchpad.scala | 149 ++-- src/main/scala/gemmini/StoreController.scala | 71 +- 27 files changed, 1396 insertions(+), 241 deletions(-) create mode 100644 src/main/scala/gemmini/NormCmd.scala create mode 100644 src/main/scala/gemmini/Normalizer.scala diff --git a/SPIKE.hash b/SPIKE.hash index e1be3611..27baea53 100644 --- a/SPIKE.hash +++ b/SPIKE.hash @@ -1 +1 @@ -bd19c16c779d419c322790b81a5ac57e47773c1c +2ed403a70f65559a3c2a06bf724d4737edc73a23 diff --git a/scripts/build-onnx-inference.sh b/scripts/build-onnx-inference.sh index 23742f5c..01d6e8ce 100755 --- a/scripts/build-onnx-inference.sh +++ b/scripts/build-onnx-inference.sh @@ -1,6 +1,6 @@ #!/bin/bash -cd /root/chipyard/generators/gemmini/software/onnxruntime-riscv/ +cd ./software/onnxruntime-riscv/ rm -rf ./build/ ./build.sh --parallel --enable_training --config=Debug --cmake_extra_defines onnxruntime_USE_SYSTOLIC=ON onnxruntime_SYSTOLIC_INT8=ON onnxruntime_SYSTOLIC_FP32=OFF cd ./systolic_runner/imagenet_runner/ diff --git a/scripts/build-vcs.sh b/scripts/build-vcs.sh index e3213521..b15a23f2 100755 --- a/scripts/build-vcs.sh +++ b/scripts/build-vcs.sh @@ -4,21 +4,24 @@ help () { echo "Build a cycle-accurate VCS simulator for RISCV Gemmini programs," echo 'matching `customConfig` in `configs/GemminiCustomConfigs.scala`.' echo - echo "Usage: $0 [-h|--help] [--debug]" + echo "Usage: $0 [-h|--help] [--debug] [-j [N]]" echo echo "Options:" echo " debug Builds a VCS simulator which generates waveforms. Without this" echo " option, the simulator will not generate any waveforms." + echo " j [N] Allow N jobs at once; infinite jobs with no arg." 
exit } show_help=0 debug="" +j="1" while [ $# -gt 0 ] ; do case $1 in -h | --help) show_help=1 ;; - --debug) debug="debug" + --debug) debug="debug" ;; + -j) j=$1 esac shift @@ -29,5 +32,5 @@ if [ $show_help -eq 1 ]; then fi cd ../../sims/vcs/ -make ${debug} CONFIG=CustomGemminiSoCConfig +make -j$j ${debug} CONFIG=CustomGemminiSoCConfig diff --git a/scripts/build-verilator.sh b/scripts/build-verilator.sh index 965d335b..49a25f29 100755 --- a/scripts/build-verilator.sh +++ b/scripts/build-verilator.sh @@ -4,21 +4,24 @@ help () { echo "Build a cycle-accurate Verilator simulator for RISCV Gemmini programs," echo 'matching `customConfig` in `configs/GemminiCustomConfigs.scala`.' echo - echo "Usage: $0 [-h|--help] [--debug]" + echo "Usage: $0 [-h|--help] [--debug] [-j [N]]" echo echo "Options:" echo " debug Builds a Verilator simulator which generates waveforms. Without" echo " this option, the simulator will not generate any waveforms." + echo " j [N] Allow N jobs at once; infinite jobs with no arg." 
exit } show_help=0 debug="" +j="1" while [ $# -gt 0 ] ; do case $1 in -h | --help) show_help=1 ;; - --debug) debug="debug" + --debug) debug="debug" ;; + -j) j=$1 esac shift diff --git a/software/gemmini-ort.json b/software/gemmini-ort.json index 7f561d79..c4a95253 100644 --- a/software/gemmini-ort.json +++ b/software/gemmini-ort.json @@ -52,7 +52,7 @@ "/output/mobilenet_optimized_ws_nhwc_out.txt" ], "overlay": "../onnxruntime-riscv/systolic_runner/imagenet_runner", - "rootfs-size": "1GiB", + "rootfs-size": "16GiB", "run": "run-ort.sh" } diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index 6ad94746..37464740 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit 6ad94746cb72ba6824b101dc6a46d5e997cd17b3 +Subproject commit 374647403d5e73543463f9f66f730aa16bc8e362 diff --git a/software/gemmini-tests-interactive.json b/software/gemmini-tests-interactive.json index 8bd5f7ea..0fe52409 100644 --- a/software/gemmini-tests-interactive.json +++ b/software/gemmini-tests-interactive.json @@ -3,5 +3,6 @@ "workdir" : ".", "base" : "br-base.json", "overlay" : "overlay", - "host-init" : "host-init.sh" + "host-init" : "host-init.sh", + "rootfs-size" : "16GiB" } diff --git a/software/gemmini-tests.json b/software/gemmini-tests.json index 72f8661c..fc0e45a9 100644 --- a/software/gemmini-tests.json +++ b/software/gemmini-tests.json @@ -4,5 +4,6 @@ "base" : "br-base.json", "overlay" : "overlay", "host-init" : "host-init.sh", - "command": "/root/run-tests.sh" + "command": "/root/run-tests.sh", + "rootfs-size" : "16GiB" } diff --git a/src/main/scala/gemmini/AccumulatorMem.scala b/src/main/scala/gemmini/AccumulatorMem.scala index f8b62298..dd5ed821 100644 --- a/src/main/scala/gemmini/AccumulatorMem.scala +++ b/src/main/scala/gemmini/AccumulatorMem.scala @@ -5,30 +5,35 @@ import chisel3.util._ import Util._ -class AccumulatorReadReq[T <: Data](n: Int, shift_width: Int, scale_t: T) extends Bundle { +class 
AccumulatorReadReq[T <: Data: Arithmetic, U <: Data](n: Int, acc_t: T, scale_t: U) extends Bundle { val addr = UInt(log2Ceil(n).W) val scale = scale_t - val relu6_shift = UInt(shift_width.W) - val act = UInt(2.W) // TODO magic number + val igelu_qb = acc_t.cloneType + val igelu_qc = acc_t.cloneType + val iexp_qln2 = acc_t.cloneType + val iexp_qln2_inv = acc_t.cloneType + val act = UInt(Activation.bitwidth.W) // TODO magic number val full = Bool() // Whether or not we return the full bitwidth output val fromDMA = Bool() } -class AccumulatorReadResp[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U, shift_width: Int) extends Bundle { +class AccumulatorReadResp[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle { val data = fullDataType.cloneType val fromDMA = Bool() val scale = scale_t.cloneType - val relu6_shift = UInt(shift_width.W) - val act = UInt(2.W) // TODO magic number - val acc_bank_id = UInt(2.W) // TODO don't hardcode + val igelu_qb = fullDataType.head.head.cloneType + val igelu_qc = fullDataType.head.head.cloneType + val iexp_qln2 = fullDataType.head.head.cloneType + val iexp_qln2_inv = fullDataType.head.head.cloneType + val act = UInt(Activation.bitwidth.W) // TODO magic number + val acc_bank_id = UInt(2.W) // TODO magic number } -class AccumulatorReadIO[T <: Data: Arithmetic, U <: Data](n: Int, shift_width: Int, fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle { - val req = Decoupled(new AccumulatorReadReq[U](n, shift_width, scale_t)) - val resp = Flipped(Decoupled(new AccumulatorReadResp[T, U](fullDataType, scale_t, shift_width))) - +class AccumulatorReadIO[T <: Data: Arithmetic, U <: Data](n: Int, fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle { + val req = Decoupled(new AccumulatorReadReq[T, U](n, fullDataType.head.head.cloneType, scale_t)) + val resp = Flipped(Decoupled(new AccumulatorReadResp[T, U](fullDataType, scale_t))) } class AccumulatorWriteReq[T <: Data: Arithmetic](n: 
Int, t: Vec[Vec[T]]) extends Bundle { @@ -36,15 +41,13 @@ class AccumulatorWriteReq[T <: Data: Arithmetic](n: Int, t: Vec[Vec[T]]) extends val data = t.cloneType val acc = Bool() val mask = Vec(t.getWidth / 8, Bool()) // TODO Use aligned_to here - // val current_waddr = Flipped(Valid(UInt(log2Ceil(n).W))) // This is the raddr that is being fed into the SRAM right now - } class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]], scale_t: U, acc_sub_banks: Int, use_shared_ext_mem: Boolean ) extends Bundle { - val read = Flipped(new AccumulatorReadIO(n, log2Ceil(t.head.head.getWidth), t, scale_t)) + val read = Flipped(new AccumulatorReadIO(n, t, scale_t)) val write = Flipped(Decoupled(new AccumulatorWriteReq(n, t))) val ext_mem = if (use_shared_ext_mem) Some(Vec(acc_sub_banks, new ExtMemIO)) else None @@ -55,7 +58,6 @@ class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]] val op2 = Output(t.cloneType) val sum = Input(t.cloneType) } - } class AccPipe[T <: Data : Arithmetic](latency: Int, t: T)(implicit ev: Arithmetic[T]) extends Module { @@ -98,8 +100,6 @@ class AccumulatorMem[T <: Data, U <: Data]( // to it, then we might not get the written data. We might need some kind of cooldown counter after addresses in the // accumulator have been written to for configurations with such small matrices - // TODO Refuse a read from an address which has only just been written to - // TODO make a new aligned_to variable specifically for AccumulatorMem. We should assume that inputs are at least // accType.getWidth/8 aligned, because it won't make sense to do matrix additions directly in the DMA otherwise. 
@@ -291,7 +291,7 @@ class AccumulatorMem[T <: Data, U <: Data]( } } - val q = Module(new Queue(new AccumulatorReadResp(t, scale_t, log2Ceil(t.head.head.getWidth)), 1, true, true)) + val q = Module(new Queue(new AccumulatorReadResp(t, scale_t), 1, true, true)) q.io.enq.bits.data := rdata_for_read_resp if (is_dummy) { @@ -300,7 +300,10 @@ class AccumulatorMem[T <: Data, U <: Data]( } q.io.enq.bits.scale := RegNext(io.read.req.bits.scale) - q.io.enq.bits.relu6_shift := RegNext(io.read.req.bits.relu6_shift) + q.io.enq.bits.igelu_qb := RegNext(io.read.req.bits.igelu_qb) + q.io.enq.bits.igelu_qc := RegNext(io.read.req.bits.igelu_qc) + q.io.enq.bits.iexp_qln2 := RegNext(io.read.req.bits.iexp_qln2) + q.io.enq.bits.iexp_qln2_inv := RegNext(io.read.req.bits.iexp_qln2_inv) q.io.enq.bits.act := RegNext(io.read.req.bits.act) q.io.enq.bits.fromDMA := RegNext(io.read.req.bits.fromDMA) q.io.enq.bits.acc_bank_id := DontCare @@ -310,7 +313,10 @@ class AccumulatorMem[T <: Data, U <: Data]( io.read.resp.bits.data := p.bits.data io.read.resp.bits.fromDMA := p.bits.fromDMA - io.read.resp.bits.relu6_shift := p.bits.relu6_shift + io.read.resp.bits.igelu_qb := p.bits.igelu_qb + io.read.resp.bits.igelu_qc := p.bits.igelu_qc + io.read.resp.bits.iexp_qln2 := p.bits.iexp_qln2 + io.read.resp.bits.iexp_qln2_inv := p.bits.iexp_qln2_inv io.read.resp.bits.act := p.bits.act io.read.resp.bits.scale := p.bits.scale io.read.resp.bits.acc_bank_id := DontCare // This is set in Scratchpad diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala index 2d23af1d..1fdd15fa 100644 --- a/src/main/scala/gemmini/AccumulatorScale.scala +++ b/src/main/scala/gemmini/AccumulatorScale.scala @@ -1,16 +1,16 @@ + package gemmini import chisel3._ import chisel3.util._ - import Util._ -class AccumulatorReadRespWithFullData[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U, shift_width: Int) extends Bundle { - val resp = new 
AccumulatorReadResp(fullDataType, scale_t, shift_width) +class AccumulatorReadRespWithFullData[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U) + extends Bundle { + val resp = new AccumulatorReadResp(fullDataType, scale_t) val full_data = fullDataType.cloneType } - class AccumulatorScaleResp[T <: Data: Arithmetic](fullDataType: Vec[Vec[T]], rDataType: Vec[Vec[T]]) extends Bundle { val full_data = fullDataType.cloneType val data = rDataType.cloneType @@ -19,26 +19,33 @@ class AccumulatorScaleResp[T <: Data: Arithmetic](fullDataType: Vec[Vec[T]], rDa } class AccumulatorScaleIO[T <: Data: Arithmetic, U <: Data]( - fullDataType: Vec[Vec[T]], scale_t: U, shift_width: Int, + fullDataType: Vec[Vec[T]], scale_t: U, rDataType: Vec[Vec[T]] ) extends Bundle { - val in = Flipped(Decoupled(new AccumulatorReadResp[T,U](fullDataType, scale_t, shift_width))) + val in = Flipped(Decoupled(new NormalizedOutput[T,U](fullDataType, scale_t))) val out = Decoupled(new AccumulatorScaleResp[T](fullDataType, rDataType)) } class AccScaleDataWithIndex[T <: Data: Arithmetic, U <: Data](t: T, u: U) extends Bundle { - val shift_width = log2Ceil(t.getWidth) - val scale = u.cloneType val act = UInt(2.W) // TODO magic number - val relu6_shift = UInt(shift_width.W) + val igelu_qb = t.cloneType + val igelu_qc = t.cloneType + val iexp_qln2 = t.cloneType + val iexp_qln2_inv = t.cloneType + val mean = t.cloneType + val max = t.cloneType + val inv_stddev = u.cloneType + val inv_sum_exp = u.cloneType val data = t.cloneType val full_data = t.cloneType val id = UInt(2.W) // TODO hardcoded val index = UInt() } -class AccScalePipe[T <: Data : Arithmetic, U <: Data](t: T, rDataType: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U, latency: Int, has_nonlinear_activations: Boolean)(implicit ev: Arithmetic[T]) extends Module { +class AccScalePipe[T <: Data, U <: Data](t: T, rDataType: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U, + latency: Int, has_nonlinear_activations: Boolean, 
has_normalizations: Boolean) + (implicit ev: Arithmetic[T]) extends Module { val u = scale_t val io = IO(new Bundle { val in = Input(Valid(new AccScaleDataWithIndex(t, u)(ev))) @@ -47,68 +54,97 @@ class AccScalePipe[T <: Data : Arithmetic, U <: Data](t: T, rDataType: Vec[Vec[T import ev._ val out = WireInit(io.in) - val e_scaled = scale_func(io.in.bits.data, io.in.bits.scale) + val e = io.in.bits.data + + val e_act = MuxCase(e, Seq( + (has_nonlinear_activations.B && io.in.bits.act === Activation.RELU) -> e.relu, + (has_nonlinear_activations.B && has_normalizations.B && io.in.bits.act === Activation.LAYERNORM) -> + (e - io.in.bits.mean).mult_with_reciprocal(io.in.bits.inv_stddev), + (has_nonlinear_activations.B && has_normalizations.B && io.in.bits.act === Activation.IGELU) -> + AccumulatorScale.igelu(e, io.in.bits.igelu_qb, io.in.bits.igelu_qc), + (has_nonlinear_activations.B && has_normalizations.B && io.in.bits.act === Activation.SOFTMAX) -> + scale_func( + AccumulatorScale.iexp(e - io.in.bits.max, io.in.bits.iexp_qln2, io.in.bits.iexp_qln2_inv, io.in.bits.igelu_qb, io.in.bits.igelu_qc), + io.in.bits.inv_sum_exp.asTypeOf(scale_t)), + )) + + val e_scaled = scale_func(e_act, io.in.bits.scale) val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head) - val e_act = MuxCase(e_clipped, Seq( - (has_nonlinear_activations.B && io.in.bits.act === Activation.RELU) -> e_clipped.relu, - (has_nonlinear_activations.B && io.in.bits.act === Activation.RELU6) -> e_clipped.relu6(io.in.bits.relu6_shift))) - out.bits.data := e_act + out.bits.data := e_clipped io.out := Pipe(out, latency) } -class AccumulatorScale[T <: Data: Arithmetic, U <: Data]( +class AccumulatorScale[T <: Data, U <: Data]( fullDataType: Vec[Vec[T]], rDataType: Vec[Vec[T]], - scale_t: U, shift_width: Int, + scale_t: U, read_small_data: Boolean, read_full_data: Boolean, scale_func: (T, U) => T, num_scale_units: Int, latency: Int, - has_nonlinear_activations: Boolean)(implicit ev: Arithmetic[T]) extends Module 
{ + has_nonlinear_activations: Boolean, has_normalizations: Boolean)(implicit ev: Arithmetic[T]) extends Module { import ev._ val io = IO(new AccumulatorScaleIO[T,U]( - fullDataType, scale_t, shift_width, rDataType + fullDataType, scale_t, rDataType )(ev)) - val t = io.in.bits.data(0)(0).cloneType + val t = io.in.bits.acc_read_resp.data(0)(0).cloneType + val acc_read_data = io.in.bits.acc_read_resp.data val out = Wire(Decoupled(new AccumulatorScaleResp[T]( fullDataType, rDataType)(ev))) if (num_scale_units == -1) { - val in = Wire(Decoupled(new AccumulatorReadRespWithFullData(fullDataType, scale_t, shift_width)(ev))) + val data = io.in.bits.acc_read_resp.data + val act = io.in.bits.acc_read_resp.act + val igelu_qb = io.in.bits.acc_read_resp.igelu_qb + val igelu_qc = io.in.bits.acc_read_resp.igelu_qc + val iexp_qln2 = io.in.bits.acc_read_resp.iexp_qln2 + val iexp_qln2_inv = io.in.bits.acc_read_resp.iexp_qln2_inv + val scale = io.in.bits.acc_read_resp.scale + + val activated_data = VecInit(data.map(v => VecInit(v.map { e => + val e_act = MuxCase(e, Seq( + (has_nonlinear_activations.B && act === Activation.RELU) -> e.relu, + (has_nonlinear_activations.B && has_normalizations.B && act === Activation.LAYERNORM) -> + (e - io.in.bits.mean).mult_with_reciprocal(io.in.bits.inv_stddev), + (has_nonlinear_activations.B && has_normalizations.B && act === Activation.IGELU) -> + AccumulatorScale.igelu(e, igelu_qb, igelu_qc), + (has_nonlinear_activations.B && has_normalizations.B && act === Activation.SOFTMAX) -> + scale_func( + AccumulatorScale.iexp(e - io.in.bits.max, iexp_qln2, iexp_qln2_inv, igelu_qb, igelu_qc), + io.in.bits.inv_sum_exp.asTypeOf(scale_t)), + )) + + val e_scaled = scale_func(e_act, scale) + val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head) + + e_clipped + }))) + + val in = Wire(Decoupled(new AccumulatorReadRespWithFullData(fullDataType, scale_t)(ev))) in.valid := io.in.valid io.in.ready := in.ready - in.bits.resp := io.in.bits - in.bits.full_data 
:= io.in.bits.data - - val pipe_out = Pipeline(in, latency, Seq.fill(latency)((x: AccumulatorReadRespWithFullData[T,U]) => x) :+ { - x: AccumulatorReadRespWithFullData[T,U] => - val activated_rdata = VecInit(x.resp.data.map(v => VecInit(v.map { e => - val e_scaled = scale_func(e, x.resp.scale) - val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head) - val e_act = MuxCase(e_clipped, Seq( - (x.resp.act === Activation.RELU) -> e_clipped.relu, - (x.resp.act === Activation.RELU6) -> e_clipped.relu6(x.resp.relu6_shift))) - - e_act - }))) - val result = WireInit(x) - result.resp.data := activated_rdata - result - }) - out.valid := pipe_out.valid + in.bits.resp := io.in.bits.acc_read_resp + in.bits.full_data := acc_read_data + in.bits.resp.data := activated_data + + val pipe_out = Pipeline(in, latency) + + out.valid := pipe_out.valid pipe_out.ready := out.ready out.bits.full_data := pipe_out.bits.full_data out.bits.data := pipe_out.bits.resp.data out.bits.fromDMA := pipe_out.bits.resp.fromDMA out.bits.acc_bank_id := pipe_out.bits.resp.acc_bank_id } else { - val width = io.in.bits.data.size * io.in.bits.data(0).size + val width = acc_read_data.size * acc_read_data(0).size val nEntries = 3 - val regs = Reg(Vec(nEntries, Valid(new AccumulatorReadResp[T,U]( - fullDataType, scale_t, shift_width)(ev)))) + /*val regs = Reg(Vec(nEntries, Valid(new AccumulatorReadResp[T,U]( + fullDataType, scale_t)(ev))))*/ + val regs = Reg(Vec(nEntries, Valid(new NormalizedOutput[T,U]( + fullDataType, scale_t)(ev)))) val out_regs = Reg(Vec(nEntries, new AccumulatorScaleResp[T]( fullDataType, rDataType)(ev))) @@ -124,7 +160,7 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data]( regs(i).valid := false.B } } - head_oh := (head_oh << 1) | head_oh(nEntries-1) + head_oh := (head_oh << 1).asUInt() | head_oh(nEntries-1) } io.in.ready := !Mux1H(tail_oh.asBools, regs.map(_.valid)) || (tail_oh === head_oh && out.fire) @@ -133,13 +169,13 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: 
Data]( when (tail_oh(i)) { regs(i).valid := true.B regs(i).bits := io.in.bits - out_regs(i).fromDMA := io.in.bits.fromDMA - out_regs(i).acc_bank_id := io.in.bits.acc_bank_id + out_regs(i).fromDMA := io.in.bits.acc_read_resp.fromDMA + out_regs(i).acc_bank_id := io.in.bits.acc_read_resp.acc_bank_id fired_masks(i).foreach(_ := false.B) completed_masks(i).foreach(_ := false.B) } } - tail_oh := (tail_oh << 1) | tail_oh(nEntries-1) + tail_oh := (tail_oh << 1).asUInt() | tail_oh(nEntries-1) } val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t)(ev))) } @@ -147,12 +183,22 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data]( for (i <- 0 until nEntries) { for (w <- 0 until width) { val input = inputs(i*width+w) + + val acc_read_resp = regs(i).bits.acc_read_resp + input.valid := regs(i).valid && !fired_masks(i)(w) - input.bits.data := regs(i).bits.data(w / io.in.bits.data(0).size)(w % io.in.bits.data(0).size) - input.bits.full_data := regs(i).bits.data(w / io.in.bits.data(0).size)(w % io.in.bits.data(0).size) - input.bits.scale := regs(i).bits.scale - input.bits.act := regs(i).bits.act - input.bits.relu6_shift := regs(i).bits.relu6_shift + input.bits.data := acc_read_resp.data(w / acc_read_data(0).size)(w % acc_read_data(0).size) + input.bits.full_data := acc_read_resp.data(w / acc_read_data(0).size)(w % acc_read_data(0).size) + input.bits.scale := acc_read_resp.scale + input.bits.act := acc_read_resp.act + input.bits.igelu_qb := acc_read_resp.igelu_qb + input.bits.igelu_qc := acc_read_resp.igelu_qc + input.bits.iexp_qln2 := acc_read_resp.iexp_qln2 + input.bits.iexp_qln2_inv := acc_read_resp.iexp_qln2_inv + input.bits.mean := regs(i).bits.mean + input.bits.max := regs(i).bits.max + input.bits.inv_stddev := regs(i).bits.inv_stddev + input.bits.inv_sum_exp := regs(i).bits.inv_sum_exp input.bits.id := i.U input.bits.index := w.U when (input.fire) { @@ -171,15 +217,16 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data]( when 
(reset.asBool) { arbOut.valid := false.B } - val pipe = Module(new AccScalePipe(t, rDataType, scale_func, scale_t, latency, has_nonlinear_activations)(ev, ev)) + val pipe = Module(new AccScalePipe(t, rDataType, scale_func, scale_t, latency, has_nonlinear_activations, + has_normalizations)) pipe.io.in := arbOut val pipe_out = pipe.io.out for (j <- 0 until nEntries) { for (w <- 0 until width) { if ((j*width+w) % num_scale_units == i) { - val id0 = w % io.in.bits.data(0).size - val id1 = w / io.in.bits.data(0).size + val id0 = w % acc_read_data(0).size + val id1 = w / acc_read_data(0).size when (pipe_out.fire && pipe_out.bits.id === j.U && pipe_out.bits.index === w.U) { out_regs(j).data (id1)(id0) := pipe_out.bits.data out_regs(j).full_data(id1)(id0) := pipe_out.bits.full_data @@ -205,6 +252,40 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data]( io.out.bits.full_data := out.bits.full_data else io.out.bits.full_data := DontCare - } +object AccumulatorScale { + def igelu[T <: Data](q: T, qb: T, qc: T)(implicit ev: Arithmetic[T]): T = { + import ev._ + + val zero = q.zero + val one = q.identity + def neg(x: T) = zero-x + + val q_sign = Mux(q.zero > q, neg(one), one) + val q_abs = Mux(q.zero > q, neg(q), q) + val q_clipped = Mux(q_abs > neg(qb), neg(qb), q_abs) + val q_poly = qc.mac(q_clipped + qb, q_clipped + qb).withWidthOf(q) + val q_erf = (q_sign * q_poly).withWidthOf(q) + (q * (q_erf + qc)).withWidthOf(q) + } + + def iexp[T <: Data](q: T, qln2: T, qln2_inv: T, qb: T, qc: T)(implicit ev: Arithmetic[T]): T = { + import ev._ + + val zero = q.zero + def neg(x: T) = zero-x + + // qln2_inv needs scale to be + // 1 / (2 ** 16) / S + + // qln2_inv / S / (2 ** 16) = 1 / ln2 + // q * qln2_inv = x / S / ln2 * S * (2 ** 16) = x / ln2 * (2 ** 16) + val neg_q_iexp = neg(q) + val z_iexp = (neg_q_iexp * qln2_inv).asUInt().do_>>(16).asTypeOf(q) // q is non-positive + val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q) + val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + 
qb).withWidthOf(q) + // we dont want a rounding shift + (q_poly_iexp.asUInt().do_>>(z_iexp.asUInt()(5, 0))).asTypeOf(q) + }} + diff --git a/src/main/scala/gemmini/Activation.scala b/src/main/scala/gemmini/Activation.scala index ed7df57f..1b7d94e6 100644 --- a/src/main/scala/gemmini/Activation.scala +++ b/src/main/scala/gemmini/Activation.scala @@ -5,5 +5,9 @@ import chisel3._ object Activation { val NONE = 0.U val RELU = 1.U - val RELU6 = 2.U + val LAYERNORM = 2.U + val IGELU = 3.U + val SOFTMAX = 4.U + + val bitwidth = 3 } diff --git a/src/main/scala/gemmini/Arithmetic.scala b/src/main/scala/gemmini/Arithmetic.scala index 4f8e9343..cdd36396 100644 --- a/src/main/scala/gemmini/Arithmetic.scala +++ b/src/main/scala/gemmini/Arithmetic.scala @@ -32,14 +32,21 @@ abstract class ArithmeticOps[T <: Data](self: T) { def *(t: T): T def mac(m1: T, m2: T): T // Returns (m1 * m2 + self) def +(t: T): T + def -(t: T): T def >>(u: UInt): T // This is a rounding shift! Rounds away from 0 def >(t: T): Bool def identity: T def withWidthOf(t: T): T def clippedToWidthOf(t: T): T // Like "withWidthOf", except that it saturates def relu: T - def relu6(shift: UInt): T def zero: T + def minimum: T + + // Optional parameters, which only need to be defined if you want to enable various optimizations for transformers + def divider(denom_t: UInt): Option[(DecoupledIO[UInt], DecoupledIO[T])] = None + def sqrt: Option[(DecoupledIO[UInt], DecoupledIO[T])] = None + def reciprocal[U <: Data](u: U): Option[(DecoupledIO[UInt], DecoupledIO[U])] = None + def mult_with_reciprocal[U <: Data](reciprocal: U) = self } object Arithmetic { @@ -48,6 +55,7 @@ object Arithmetic { override def *(t: UInt) = self * t override def mac(m1: UInt, m2: UInt) = m1 * m2 + self override def +(t: UInt) = self + t + override def -(t: UInt) = self - t override def >>(u: UInt) = { // The equation we use can be found here: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm @@ -72,15 
+80,10 @@ object Arithmetic { } override def relu: UInt = self - override def relu6(shift: UInt): UInt = { - val max6 = (6.U << shift).asUInt() - val maxwidth = ((1 << (self.getWidth-1))-1).U - val max = Mux(max6 > maxwidth, maxwidth, max6)(self.getWidth-1, 0).asUInt() - Mux(self < max, self, max) - } override def zero: UInt = 0.U override def identity: UInt = 1.U + override def minimum: UInt = 0.U } } @@ -89,6 +92,7 @@ object Arithmetic { override def *(t: SInt) = self * t override def mac(m1: SInt, m2: SInt) = m1 * m2 + self override def +(t: SInt) = self + t + override def -(t: SInt) = self - t override def >>(u: UInt) = { // The equation we use can be found here: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm @@ -122,15 +126,204 @@ object Arithmetic { } override def relu: SInt = Mux(self >= 0.S, self, 0.S) - override def relu6(shift: UInt): SInt = { - val max6 = (6.S << shift).asSInt() - val maxwidth = ((1 << (self.getWidth-1))-1).S - val max = Mux(max6 > maxwidth, maxwidth, max6)(self.getWidth-1, 0).asSInt() - MuxCase(self, Seq((self < 0.S) -> 0.S, (self > max) -> max)) - } override def zero: SInt = 0.S override def identity: SInt = 1.S + override def minimum: SInt = (-(1 << (self.getWidth-1))).S + + override def divider(denom_t: UInt): Option[(DecoupledIO[UInt], DecoupledIO[SInt])] = { + // TODO this uses a floating point divider, but we should use an integer divider instead + + val input = Wire(Decoupled(denom_t.cloneType)) + val output = Wire(Decoupled(self.cloneType)) + + // We translate our integer to floating-point form so that we can use the hardfloat divider + val expWidth = log2Up(self.getWidth) + 1 + val sigWidth = self.getWidth + + def sin_to_float(x: SInt) = { + val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth)) + in_to_rec_fn.io.signedIn := true.B + in_to_rec_fn.io.in := x.asUInt() + in_to_rec_fn.io.roundingMode := consts.round_minMag // consts.round_near_maxMag + 
in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding + + in_to_rec_fn.io.out + } + + def uin_to_float(x: UInt) = { + val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth)) + in_to_rec_fn.io.signedIn := false.B + in_to_rec_fn.io.in := x + in_to_rec_fn.io.roundingMode := consts.round_minMag // consts.round_near_maxMag + in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding + + in_to_rec_fn.io.out + } + + def float_to_in(x: UInt) = { + val rec_fn_to_in = Module(new RecFNToIN(expWidth = expWidth, sigWidth, self.getWidth)) + rec_fn_to_in.io.signedOut := true.B + rec_fn_to_in.io.in := x + rec_fn_to_in.io.roundingMode := consts.round_minMag // consts.round_near_maxMag + + rec_fn_to_in.io.out.asSInt() + } + + val self_rec = sin_to_float(self) + val denom_rec = uin_to_float(input.bits) + + // Instantiate the hardloat divider + val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 0)) + + input.ready := divider.io.inReady + divider.io.inValid := input.valid + divider.io.sqrtOp := false.B + divider.io.a := self_rec + divider.io.b := denom_rec + divider.io.roundingMode := consts.round_minMag + divider.io.detectTininess := consts.tininess_afterRounding + + output.valid := divider.io.outValid_div + output.bits := float_to_in(divider.io.out) + + assert(!output.valid || output.ready) + + Some((input, output)) + } + + override def sqrt: Option[(DecoupledIO[UInt], DecoupledIO[SInt])] = { + // TODO this uses a floating point divider, but we should use an integer divider instead + + val input = Wire(Decoupled(UInt(0.W))) + val output = Wire(Decoupled(self.cloneType)) + + input.bits := DontCare + + // We translate our integer to floating-point form so that we can use the hardfloat divider + val expWidth = log2Up(self.getWidth) + 1 + val sigWidth = self.getWidth + + def in_to_float(x: SInt) = { + val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth)) + in_to_rec_fn.io.signedIn := true.B + 
in_to_rec_fn.io.in := x.asUInt() + in_to_rec_fn.io.roundingMode := consts.round_minMag // consts.round_near_maxMag + in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding + + in_to_rec_fn.io.out + } + + def float_to_in(x: UInt) = { + val rec_fn_to_in = Module(new RecFNToIN(expWidth = expWidth, sigWidth, self.getWidth)) + rec_fn_to_in.io.signedOut := true.B + rec_fn_to_in.io.in := x + rec_fn_to_in.io.roundingMode := consts.round_minMag // consts.round_near_maxMag + + rec_fn_to_in.io.out.asSInt() + } + + val self_rec = in_to_float(self) + + // Instantiate the hardloat sqrt + val sqrter = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 0)) + + input.ready := sqrter.io.inReady + sqrter.io.inValid := input.valid + sqrter.io.sqrtOp := true.B + sqrter.io.a := self_rec + sqrter.io.b := DontCare + sqrter.io.roundingMode := consts.round_minMag + sqrter.io.detectTininess := consts.tininess_afterRounding + + output.valid := sqrter.io.outValid_sqrt + output.bits := float_to_in(sqrter.io.out) + + assert(!output.valid || output.ready) + + Some((input, output)) + } + + override def reciprocal[U <: Data](u: U): Option[(DecoupledIO[UInt], DecoupledIO[U])] = u match { + case Float(expWidth, sigWidth) => + val input = Wire(Decoupled(UInt(0.W))) + val output = Wire(Decoupled(u.cloneType)) + + input.bits := DontCare + + // We translate our integer to floating-point form so that we can use the hardfloat divider + def in_to_float(x: SInt) = { + val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth)) + in_to_rec_fn.io.signedIn := true.B + in_to_rec_fn.io.in := x.asUInt() + in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag + in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding + + in_to_rec_fn.io.out + } + + val self_rec = in_to_float(self) + val one_rec = in_to_float(1.S) + + // Instantiate the hardloat divider + val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 0)) + + input.ready := 
divider.io.inReady + divider.io.inValid := input.valid + divider.io.sqrtOp := false.B + divider.io.a := one_rec + divider.io.b := self_rec + divider.io.roundingMode := consts.round_near_even + divider.io.detectTininess := consts.tininess_afterRounding + + output.valid := divider.io.outValid_div + output.bits := fNFromRecFN(expWidth, sigWidth, divider.io.out).asTypeOf(u) + + assert(!output.valid || output.ready) + + Some((input, output)) + + case _ => None + } + + override def mult_with_reciprocal[U <: Data](reciprocal: U): SInt = reciprocal match { + case recip @ Float(expWidth, sigWidth) => + def in_to_float(x: SInt) = { + val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth)) + in_to_rec_fn.io.signedIn := true.B + in_to_rec_fn.io.in := x.asUInt() + in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag + in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding + + in_to_rec_fn.io.out + } + + def float_to_in(x: UInt) = { + val rec_fn_to_in = Module(new RecFNToIN(expWidth = expWidth, sigWidth, self.getWidth)) + rec_fn_to_in.io.signedOut := true.B + rec_fn_to_in.io.in := x + rec_fn_to_in.io.roundingMode := consts.round_minMag + + rec_fn_to_in.io.out.asSInt() + } + + val self_rec = in_to_float(self) + val reciprocal_rec = recFNFromFN(expWidth, sigWidth, recip.bits) + + // Instantiate the hardloat divider + val muladder = Module(new MulAddRecFN(expWidth, sigWidth)) + muladder.io.op := 0.U + muladder.io.roundingMode := consts.round_near_even + muladder.io.detectTininess := consts.tininess_afterRounding + + muladder.io.a := self_rec + muladder.io.b := reciprocal_rec + muladder.io.c := 0.U + + float_to_in(muladder.io.out) + + case _ => self + } } } @@ -239,6 +432,12 @@ object Arithmetic { result } + override def -(t: Float): Float = { + val t_sgn = t.bits(t.getWidth-1) + val neg_t = Cat(~t_sgn, t.bits(t.getWidth-2,0)).asTypeOf(t) + self + neg_t + } + override def >>(u: UInt): Float = { // Recode self val 
self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits) @@ -322,55 +521,9 @@ object Arithmetic { result } - override def relu6(shift: UInt): Float = { - // Get a constant 6 as a float - val in_to_rec_fn = Module(new INToRecFN(log2Up(6+1), self.expWidth, self.sigWidth)) - in_to_rec_fn.io.signedIn := false.B - in_to_rec_fn.io.in := 6.U - in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag - in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding - - val six_rec = in_to_rec_fn.io.out - - // Get 2^shift as a float - val shift_exp = self.bias.U(self.expWidth.W) + shift - val shift_fn = Cat(0.U(1.W), shift_exp, 0.U((self.sigWidth-1).W)) - val shift_rec = recFNFromFN(self.expWidth, self.sigWidth, shift_fn) - - // Get 6*(2^shift) as a float - val muladder = Module(new MulAddRecFN(self.expWidth, self.sigWidth)) - - muladder.io.op := 0.U - muladder.io.roundingMode := consts.round_near_even // consts.round_near_maxMag - muladder.io.detectTininess := consts.tininess_afterRounding - - muladder.io.a := six_rec - muladder.io.b := shift_rec - muladder.io.c := 0.U - - val shifted_rec = muladder.io.out - - // Now, compare self and 6*(2^shift) to calculate the activation function - val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits) - val self_raw = rawFloatFromFN(self.expWidth, self.sigWidth, self.bits) - - val comparer = Module(new CompareRecFN(self.expWidth, self.sigWidth)) - comparer.io.a := self_rec - comparer.io.b := shifted_rec - comparer.io.signaling := false.B - - val larger_than_six = comparer.io.gt - - val result_rec = Mux(!self_raw.isZero && self_raw.sign, 0.U, - Mux(larger_than_six, shifted_rec, self_rec)) - - val result = Wire(Float(self.expWidth, self.sigWidth)) - result.bits := fNFromRecFN(self.expWidth, self.sigWidth, result_rec) - result - } - override def zero: Float = 0.U.asTypeOf(self) override def identity: Float = Cat(0.U(2.W), ~(0.U((self.expWidth-1).W)), 0.U((self.sigWidth-1).W)).asTypeOf(self) + 
override def minimum: Float = Cat(1.U, ~(0.U(self.expWidth.W)), 0.U((self.sigWidth-1).W)).asTypeOf(self) } } @@ -379,14 +532,15 @@ object Arithmetic { override def *(t: DummySInt) = self.dontCare override def mac(m1: DummySInt, m2: DummySInt) = self.dontCare override def +(t: DummySInt) = self.dontCare + override def -(t: DummySInt) = self.dontCare override def >>(t: UInt) = self.dontCare override def >(t: DummySInt): Bool = false.B override def identity = self.dontCare override def withWidthOf(t: DummySInt) = self.dontCare override def clippedToWidthOf(t: DummySInt) = self.dontCare override def relu = self.dontCare - override def relu6(shift: UInt) = self.dontCare override def zero = self.dontCare + override def minimum: DummySInt = self.dontCare } } } diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala index 7464dc61..2a060ea9 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -146,7 +146,7 @@ object GemminiConfigs { Mux(overflow, sat, rec_fn_to_in.io.out.asTypeOf(t)) }, - 1, Float(8, 24), -1, + 8, Float(8, 24), -1, identity = "1.0", c_str = "({float y = ROUND_NEAR_EVEN((x) * (scale)); y > INT8_MAX ? INT8_MAX : (y < INT8_MIN ? 
INT8_MIN : (acc_t)y);})" )), diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index cf572c0c..2c15d3ea 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -399,7 +399,6 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] unrolled_cmd.ready := true.B } } - } // Debugging signals diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala index e1ed7199..9172e8ee 100644 --- a/src/main/scala/gemmini/CustomConfigs.scala +++ b/src/main/scala/gemmini/CustomConfigs.scala @@ -41,6 +41,14 @@ object GemminiCustomConfigs { acc_capacity = CapacityInKilobytes(128), ) + val bertInferenceConfig = defaultConfig.copy( + has_training_convs = false, + has_max_pool = false, + has_normalizations = true, + + acc_capacity = CapacityInKilobytes(128), + ) + // Specify which of your custom configs you want to build here val customConfig = baselineInferenceConfig } diff --git a/src/main/scala/gemmini/DMACommandTracker.scala b/src/main/scala/gemmini/DMACommandTracker.scala index 3390cbdf..9d4f71e6 100644 --- a/src/main/scala/gemmini/DMACommandTracker.scala +++ b/src/main/scala/gemmini/DMACommandTracker.scala @@ -20,7 +20,6 @@ class DMACommandTracker[T <: Data](val nCmds: Int, val maxBytes: Int, tag_t: => val tag = Input(tag_t.cloneType) val bytes_to_read = Input(UInt(log2Up(maxBytes+1).W)) val cmd_id = Output(cmd_id_t.cloneType) - } val bits = new BitsT(tag_t.cloneType, cmd_id_t.cloneType) diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala index 62fc4495..514d918e 100644 --- a/src/main/scala/gemmini/ExecuteController.scala +++ b/src/main/scala/gemmini/ExecuteController.scala @@ -29,7 +29,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val acc = new Bundle { val read_req = Vec(acc_banks, Decoupled(new AccumulatorReadReq( - acc_bank_entries, 
log2Up(accType.getWidth), acc_scale_t + acc_bank_entries, accType, acc_scale_t ))) val read_resp = Flipped(Vec(acc_banks, Decoupled(new AccumulatorScaleResp( @@ -115,8 +115,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val in_shift = Reg(UInt(log2Up(accType.getWidth).W)) val acc_scale = Reg(acc_scale_t) - val relu6_shift = Reg(UInt(log2Up(accType.getWidth).W)) - val activation = if (has_nonlinear_activations) Reg(UInt(2.W)) else Activation.NONE // TODO magic number + val activation = if (has_nonlinear_activations) Reg(UInt(Activation.bitwidth.W)) else Activation.NONE // TODO magic number val a_transpose = Reg(Bool()) val bd_transpose = Reg(Bool()) val config_initialized = RegInit(false.B) @@ -470,7 +469,10 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In io.acc.read_req(i).valid := read_a_from_acc || read_b_from_acc || read_d_from_acc io.acc.read_req(i).bits.scale := acc_scale io.acc.read_req(i).bits.full := false.B - io.acc.read_req(i).bits.relu6_shift := relu6_shift + io.acc.read_req(i).bits.igelu_qb := DontCare + io.acc.read_req(i).bits.igelu_qc := DontCare + io.acc.read_req(i).bits.iexp_qln2 := DontCare + io.acc.read_req(i).bits.iexp_qln2_inv := DontCare io.acc.read_req(i).bits.act := activation io.acc.read_req(i).bits.fromDMA := false.B io.acc.read_req(i).bits.addr := MuxCase(a_address_rs1.acc_row() + a_fire_counter, @@ -487,7 +489,10 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In io.acc.read_req(i).valid := false.B io.acc.read_req(i).bits.scale := DontCare io.acc.read_req(i).bits.full := false.B - io.acc.read_req(i).bits.relu6_shift := relu6_shift + io.acc.read_req(i).bits.igelu_qb := DontCare + io.acc.read_req(i).bits.igelu_qc := DontCare + io.acc.read_req(i).bits.iexp_qln2 := DontCare + io.acc.read_req(i).bits.iexp_qln2_inv := DontCare io.acc.read_req(i).bits.act := DontCare io.acc.read_req(i).bits.fromDMA := false.B io.acc.read_req(i).bits.addr 
:= DontCare @@ -550,7 +555,6 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In } in_shift := config_ex_rs2.in_shift acc_scale := rs1s(0)(xLen - 1, 32).asTypeOf(acc_scale_t) // TODO magic number - relu6_shift := config_ex_rs2.relu6_shift a_transpose := config_ex_rs1.a_transpose bd_transpose := config_ex_rs1.b_transpose @@ -614,7 +618,6 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In start_inputting_a := !a_should_be_fed_into_transposer start_inputting_b := !b_should_be_fed_into_transposer - start_inputting_b := true.B control_state := compute } @@ -924,8 +927,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val activated_wdata = VecInit(mesh.io.resp.bits.data.map(v => VecInit(v.map { e => val e_clipped = e.clippedToWidthOf(inputType) val e_act = MuxCase(e_clipped, Seq( - (activation === Activation.RELU) -> e_clipped.relu, - (activation === Activation.RELU6) -> e_clipped.relu6(relu6_shift))) + (activation === Activation.RELU) -> e_clipped.relu)) e_act }))) diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala index 40c1c777..573581ec 100644 --- a/src/main/scala/gemmini/GemminiConfigs.scala +++ b/src/main/scala/gemmini/GemminiConfigs.scala @@ -86,7 +86,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( has_max_pool: Boolean = true, has_nonlinear_activations: Boolean = true, has_dw_convs: Boolean = true, - + has_normalizations: Boolean = false, has_first_layer_optimizations: Boolean = true, use_firesim_simulation_counters: Boolean = false, @@ -492,6 +492,11 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( header ++= "#define HAS_FIRST_LAYER_OPTIMIZATIONS\n\n" } + if (has_normalizations) { + header ++= "#define HAS_NORMALIZATIONS\n" + header ++= "#define NORM_STAT_IDS 4\n\n" + } + header ++= s"#endif // $guard\n" header.toString() } diff --git 
a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala index ea3aed12..7bca089b 100644 --- a/src/main/scala/gemmini/GemminiISA.scala +++ b/src/main/scala/gemmini/GemminiISA.scala @@ -38,7 +38,7 @@ object GemminiISA { val CONFIG_EX = 0.U val CONFIG_LOAD = 1.U val CONFIG_STORE = 2.U - val CONFIG_IM2COL = 3.U + val CONFIG_NORM = 3.U //========================================================================== // cisc-gemmini opcodes @@ -107,7 +107,7 @@ object GemminiISA { val _unused = UInt(CONFIG_MVIN_RS1_UNUSED_WIDTH.W) } - val CONFIG_MVOUT_RS1_UNUSED_WIDTH = 2 + val CONFIG_MVOUT_RS1_CMD_TYPE_WIDTH = 2 val CONFIG_MVOUT_RS1_ACTIVATION_WIDTH = 2 val CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH = 2 val CONFIG_MVOUT_RS1_MAX_POOLING_WINDOW_SIZE_WIDTH = 2 @@ -132,7 +132,7 @@ object GemminiISA { val pool_size = UInt(CONFIG_MVOUT_RS1_MAX_POOLING_WINDOW_SIZE_WIDTH.W) val pool_stride = UInt(CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH.W) val activation = UInt(CONFIG_MVOUT_RS1_ACTIVATION_WIDTH.W) - val _unused = UInt(CONFIG_MVOUT_RS1_UNUSED_WIDTH.W) + val cmd_type = UInt(CONFIG_MVOUT_RS1_CMD_TYPE_WIDTH.W) } val CONFIG_MVOUT_RS2_ACC_SCALE_WIDTH = 32 @@ -145,6 +145,36 @@ object GemminiISA { val stride = UInt(stride_bits.W) } + val CONFIG_NORM_RS1_Q_CONST_WIDTH = 32 + val CONFIG_NORM_RS1_SPACER1_WIDTH = 13 + val CONFIG_NORM_RS1_Q_CONST_TYPE_WIDTH = 1 + val CONFIG_NORM_RS1_SET_STATS_ID_ONLY_WIDTH = 1 + val CONFIG_NORM_RS1_ACT_MSB_WIDTH = 1 + val CONFIG_NORM_RS1_NORM_STATS_ID_WIDTH = 8 + val CONFIG_NORM_RS1_SPACER0_WIDTH = 6 + val CONFIG_NORM_RS1_CMD_TYPE_WIDTH = 2 + + class ConfigNormRs1(acc_t_bits: Int = 32) extends Bundle { + val q_const = UInt(acc_t_bits.W) + val _spacer1 = UInt(CONFIG_NORM_RS1_SPACER1_WIDTH.W) + val q_const_type = UInt(CONFIG_NORM_RS1_Q_CONST_TYPE_WIDTH.W) + val set_stats_id_only = UInt(CONFIG_NORM_RS1_SET_STATS_ID_ONLY_WIDTH.W) + val act_msb = UInt(CONFIG_NORM_RS1_ACT_MSB_WIDTH.W) + val norm_stats_id = 
UInt(CONFIG_NORM_RS1_NORM_STATS_ID_WIDTH.W) + val _spacer0 = UInt(CONFIG_NORM_RS1_SPACER0_WIDTH.W) + val cmd_type = UInt(CONFIG_NORM_RS1_CMD_TYPE_WIDTH.W) + } + + val CONFIG_NORM_RS2_QC_WIDTH = 32 + val CONFIG_NORM_RS2_QB_WIDTH = 32 + + class ConfigNormRs2(acc_t_bits: Int) extends Bundle { + val _spacer1 = UInt((CONFIG_NORM_RS2_QC_WIDTH - acc_t_bits).W) + val qc = UInt(acc_t_bits.W) + val _spacer0 = UInt((CONFIG_NORM_RS2_QB_WIDTH - acc_t_bits).W) + val qb = UInt(acc_t_bits.W) + } + val CONFIG_EX_RS1_CMD_TYPE_WIDTH = 2 val CONFIG_EX_RS1_DATAFLOW_WIDTH = 1 val CONFIG_EX_RS1_ACTIVATION_WIDTH = 2 diff --git a/src/main/scala/gemmini/LocalAddr.scala b/src/main/scala/gemmini/LocalAddr.scala index 92e46ffc..b53addea 100644 --- a/src/main/scala/gemmini/LocalAddr.scala +++ b/src/main/scala/gemmini/LocalAddr.scala @@ -21,8 +21,13 @@ class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_en val is_acc_addr = Bool() val accumulate = Bool() val read_full_acc_row = Bool() - val garbage = UInt(((localAddrBits - maxAddrBits - 4) max 0).W) - val garbage_bit = if (localAddrBits - maxAddrBits >= 4) UInt(1.W) else UInt(0.W) + val norm_cmd = NormCmd() + + private val metadata_w = is_acc_addr.getWidth + accumulate.getWidth + read_full_acc_row.getWidth + norm_cmd.getWidth + assert(maxAddrBits + metadata_w < 32) + + val garbage = UInt(((localAddrBits - maxAddrBits - metadata_w - 1) max 0).W) + val garbage_bit = if (localAddrBits - maxAddrBits >= metadata_w + 1) UInt(1.W) else UInt(0.W) val data = UInt(maxAddrBits.W) def sp_bank(dummy: Int = 0) = if (spAddrBits == spBankRowBits) 0.U else data(spAddrBits - 1, spBankRowBits) diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index 3d7a099e..3fe28e98 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -922,7 +922,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: pre_pool_config_cmd_rs1.pool_size := 
pool_size pre_pool_config_cmd_rs1.pool_stride := pool_stride pre_pool_config_cmd_rs1.activation := req.activation - pre_pool_config_cmd_rs1._unused := CONFIG_STORE + pre_pool_config_cmd_rs1.cmd_type := CONFIG_STORE pre_pool_config_cmd.rs1 := pre_pool_config_cmd_rs1.asUInt() val pre_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType) @@ -938,7 +938,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: val post_pool_config_cmd_rs1 = Wire(new ConfigMvoutRs1) post_pool_config_cmd_rs1 := DontCare post_pool_config_cmd_rs1.activation := req.activation - post_pool_config_cmd_rs1._unused := CONFIG_STORE + post_pool_config_cmd_rs1.cmd_type := CONFIG_STORE post_pool_config_cmd.rs1 := post_pool_config_cmd_rs1.asUInt() val post_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType) diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala index 52871276..5f564000 100644 --- a/src/main/scala/gemmini/LoopMatmul.scala +++ b/src/main/scala/gemmini/LoopMatmul.scala @@ -488,6 +488,7 @@ class LoopMatmulStCReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat val dram_addr = UInt(coreMaxAddrBits.W) val dram_stride = UInt(coreMaxAddrBits.W) val full_c = Bool() + val act = UInt(Activation.bitwidth.W) val addr_start = UInt(log2Up(max_acc_addr).W) val loop_id = UInt(log2Up(concurrent_loops).W) } @@ -513,7 +514,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In }) object State extends ChiselEnum { - val idle, st = Value + val idle, st, ln_config, ln_st = Value } import State._ val state = RegInit(idle) @@ -522,6 +523,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In val max_blocks = Mux(req.full_c, 1.U, Mux(req.max_j <= max_block_len.U, req.max_j, max_block_len.U)) + // Non-normalization-related iterators and calculations val j = Reg(UInt(iterator_bitwidth.W)) val i = Reg(UInt(iterator_bitwidth.W)) @@ -547,26 +549,80 @@ class 
LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In mvout_cmd_rs2.local_addr := cast_to_acc_addr(mvout_cmd_rs2.local_addr, sp_addr, accumulate = false.B, read_full = req.full_c) mvout_cmd.rs2 := mvout_cmd_rs2.asUInt() + // Layernorm iterators and calculations + val ln_row = Reg(UInt(iterator_bitwidth.W)) + val ln_cmd = Reg(UInt(iterator_bitwidth.W)) + val ln_stat_id = Reg(UInt(iterator_bitwidth.W)) + + val NORM_STAT_IDS = 4 // TODO magic number + + val ln_norm_cmds = VecInit(VecInit(NormCmd.SUM, NormCmd.MEAN), VecInit(NormCmd.VARIANCE, NormCmd.INV_STDDEV), + VecInit(NormCmd.RESET, NormCmd.RESET)) + + val sm_norm_cmds = VecInit(VecInit(NormCmd.MAX, NormCmd.MAX), VecInit(NormCmd.SUM_EXP, NormCmd.INV_SUM_EXP), + VecInit(NormCmd.RESET, NormCmd.RESET)) + + val ln_stat_ids = Mux(rows -& ln_row > NORM_STAT_IDS.U, NORM_STAT_IDS.U, rows -& ln_row) + + val ln_r = ln_row +& ln_stat_id + + val ln_sp_addr = acc_addr_start +& (i * req.max_j +& j) * block_size.U +& ln_r + val ln_norm_cmd = Mux(j +& max_blocks >= req.max_j, + Mux(req.act === Activation.LAYERNORM, ln_norm_cmds(ln_cmd)(1), sm_norm_cmds(ln_cmd)(1)), + Mux(req.act === Activation.LAYERNORM, ln_norm_cmds(ln_cmd)(0), sm_norm_cmds(ln_cmd)(0))) + + // TODO we assume for now that full_C and layernorm aren't true at the same + val ln_dram_offset = ((i * req.dram_stride +& j) * block_size.U +& ln_r * req.dram_stride) * (input_w/8).U + val ln_dram_addr = req.dram_addr + LoopMatmul.castDramOffset(ln_dram_offset) + + val ln_config_norm_rs1 = Wire(new GemminiISA.ConfigNormRs1) + ln_config_norm_rs1 := DontCare + ln_config_norm_rs1.set_stats_id_only := 1.U + ln_config_norm_rs1.cmd_type := CONFIG_NORM + ln_config_norm_rs1.norm_stats_id := ln_stat_id + + val ln_config_norm = Wire(new RoCCCommand) + ln_config_norm := DontCare + ln_config_norm.inst.funct := CONFIG_CMD + ln_config_norm.rs1 := ln_config_norm_rs1.asUInt() + ln_config_norm.rs2 := DontCare + + val ln_mvout_cmd = Wire(new RoCCCommand) + ln_mvout_cmd := 
DontCare + ln_mvout_cmd.inst.funct := STORE_CMD + ln_mvout_cmd.rs1 := ln_dram_addr + + val ln_mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType) + ln_mvout_cmd_rs2 := DontCare + ln_mvout_cmd_rs2.num_rows := 1.U + ln_mvout_cmd_rs2.num_cols := cols.asUInt() + ln_mvout_cmd_rs2.local_addr := cast_to_acc_addr(ln_mvout_cmd_rs2.local_addr, ln_sp_addr, accumulate = false.B, read_full = req.full_c) + ln_mvout_cmd_rs2.local_addr.norm_cmd := ln_norm_cmd + ln_mvout_cmd.rs2 := ln_mvout_cmd_rs2.asUInt() + io.req.ready := state === idle io.j := j io.i := i io.idle := state === idle - // The order here is k, j, i - // val ex_ahead = io.ex_completed || (io.ex_k === req.max_k - 1.U && (io.ex_j > j || (io.ex_j === j && io.ex_i > i))) + // The order here is k, j, i when not doing LAYERNORM or SOFTMAX val ex_ahead = io.ex_completed || - (io.ex_k === req.max_k - 1.U && - (io.ex_j >= j + blocks || - ((io.ex_j === j + blocks - 1.U) && io.ex_i > i))) + ((req.act =/= Activation.LAYERNORM) && (req.act =/= Activation.SOFTMAX) && + (io.ex_k === req.max_k - 1.U && + (io.ex_j >= j + blocks || + ((io.ex_j === j + blocks - 1.U) && io.ex_i > i)))) io.cmd.valid := state =/= idle && !io.rob_overloaded && ex_ahead && req.dram_addr =/= 0.U - io.cmd.bits := mvout_cmd + io.cmd.bits := MuxCase(mvout_cmd, Seq( + (state === ln_config) -> ln_config_norm, + (state === ln_st) -> ln_mvout_cmd, + )) io.loop_id := req.loop_id when (req.dram_addr === 0.U) { state := idle - }.elsewhen (io.cmd.fire) { + }.elsewhen (io.cmd.fire() && state === st) { // The order here is k, j, i val next_i = floorAdd(i, 1.U, req.max_i) val next_j = floorAdd(j, max_blocks, req.max_j, next_i === 0.U) @@ -577,13 +633,38 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In when (next_i === 0.U && next_j === 0.U) { state := idle } + }.elsewhen (io.cmd.fire() && state === ln_config) { + state := ln_st + }.elsewhen (io.cmd.fire() && state === ln_st) { + val next_j = floorAdd(j, max_blocks, req.max_j) + val next_stat_id 
= floorAdd(ln_stat_id, 1.U, ln_stat_ids, next_j === 0.U) + val next_cmd = floorAdd(ln_cmd, 1.U, ln_norm_cmds.size.U, next_j === 0.U && next_stat_id === 0.U) + val next_row = floorAdd(ln_row, NORM_STAT_IDS.U, rows, next_j === 0.U && next_stat_id === 0.U && next_cmd === 0.U) + val next_i = floorAdd(i, 1.U, req.max_i, + next_j === 0.U && next_stat_id === 0.U && next_cmd === 0.U && next_row === 0.U) + + j := next_j + ln_stat_id := next_stat_id + ln_cmd := next_cmd + ln_row := next_row + i := next_i + + when (next_i === 0.U && next_row === 0.U && next_cmd === 0.U && next_stat_id === 0.U && next_j === 0.U) { + state := idle + }.elsewhen (next_j === 0.U) { + state := ln_config + } } when (io.req.fire) { req := io.req.bits - state := st + state := Mux((io.req.bits.act === Activation.LAYERNORM) || (io.req.bits.act === Activation.SOFTMAX), ln_config, st) + j := 0.U i := 0.U + ln_row := 0.U + ln_cmd := 0.U + ln_stat_id := 0.U } } @@ -610,12 +691,12 @@ class LoopMatmulState(val iterator_bitwidth: Int, val coreMaxAddrBits: Int, val val a_transpose = Bool() val b_transpose = Bool() + val act = UInt(Activation.bitwidth.W) + val low_d = Bool() val full_c = Bool() val ex_accumulate = Bool() - val weightA = UInt(8.W) // TODO magic numbers - val configured = Bool() val running = Bool() @@ -706,7 +787,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size val ab_loads_on_same_loop = ldA.io.loop_id === ldB.io.loop_id ldab_arb.io.forceA := !ab_loads_on_same_loop && ldA.io.loop_id === head_loop_id ldab_arb.io.forceB := !ab_loads_on_same_loop && ldB.io.loop_id === head_loop_id - ldab_arb.io.weightA := head_loop.weightA + ldab_arb.io.weightA := 0.U ldab_arb.io.inA_idle := ldA.io.idle ldab_arb.io.inB_idle := ldB.io.idle ldab_arb.io.inA_k := ldA.io.k @@ -812,11 +893,11 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size loop_being_configured.ex_accumulate := cmd.bits.cmd.rs1(0) loop_being_configured.full_c := cmd.bits.cmd.rs1(1) 
loop_being_configured.low_d := cmd.bits.cmd.rs1(2) + loop_being_configured.act := cmd.bits.cmd.rs1(8+Activation.bitwidth-1, 8) // TODO magic numbers + loop_being_configured.a_transpose := cmd.bits.cmd.rs2(0) loop_being_configured.b_transpose := cmd.bits.cmd.rs2(1) - loop_being_configured.weightA := cmd.bits.cmd.rs1(15, 8) // TODO magic numbers - loop_being_configured.configured := true.B loops_configured := loops_configured + 1.U @@ -928,6 +1009,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size stC.io.req.bits.dram_addr := loop_requesting_st.c_dram_addr stC.io.req.bits.dram_stride := loop_requesting_st.c_dram_stride stC.io.req.bits.full_c := loop_requesting_st.full_c + stC.io.req.bits.act := loop_requesting_st.act stC.io.req.bits.addr_start := st_c_addr_start stC.io.req.bits.loop_id := loop_requesting_st_id diff --git a/src/main/scala/gemmini/NormCmd.scala b/src/main/scala/gemmini/NormCmd.scala new file mode 100644 index 00000000..515fabb0 --- /dev/null +++ b/src/main/scala/gemmini/NormCmd.scala @@ -0,0 +1,23 @@ + +package gemmini + +import chisel3._ +import chisel3.util._ +import chisel3.experimental.ChiselEnum + +object NormCmd extends ChiselEnum { + val RESET, SUM, MEAN, VARIANCE, INV_STDDEV, MAX, SUM_EXP, INV_SUM_EXP = Value + + def writes_to_main_memory(cmd: Type): Bool = { + cmd === RESET + } + + def non_reset_version(cmd: Type): Type = { + MuxCase(cmd, Seq( + (cmd === MEAN) -> SUM, + (cmd === MAX) -> MAX, + (cmd === INV_STDDEV) -> VARIANCE, + (cmd === INV_SUM_EXP) -> SUM_EXP + )) + } +} diff --git a/src/main/scala/gemmini/Normalizer.scala b/src/main/scala/gemmini/Normalizer.scala new file mode 100644 index 00000000..89dca2db --- /dev/null +++ b/src/main/scala/gemmini/Normalizer.scala @@ -0,0 +1,635 @@ + +package gemmini + +import chisel3._ +import chisel3.experimental.ChiselEnum +import chisel3.util._ +import gemmini.AccumulatorScale.iexp +import hardfloat.{DivSqrtRecFN_small, INToRecFN, consts, fNFromRecFN} + +class 
NormalizedInput[T <: Data: Arithmetic, U <: Data](max_len: Int, num_stats: Int, fullDataType: Vec[Vec[T]], + scale_t: U) extends Bundle { + val acc_read_resp = new AccumulatorReadResp[T,U](fullDataType, scale_t) + val len = UInt(log2Up(max_len + 1).W) + val stats_id = UInt(log2Up(num_stats).W) + val cmd = NormCmd() +} + +class NormalizedOutput[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle { + val acc_read_resp = new AccumulatorReadResp[T,U](fullDataType, scale_t) + val mean = fullDataType.head.head.cloneType + val max = fullDataType.head.head.cloneType + val inv_stddev = scale_t.cloneType + val inv_sum_exp = scale_t.cloneType +} + +class IExpConst[T <: Data](acc_t: T) extends Bundle { + val qb = acc_t.cloneType + val qc = acc_t.cloneType + val qln2 = acc_t.cloneType + val qln2_inv = acc_t.cloneType +} + +class AccumulationLanes[T <: Data](num_stats: Int, acc_t: T, n_lanes: Int, latency: Int)(implicit ev: Arithmetic[T]) + extends Module { + // Each lane computes a sum, or an error-squared sum + + import ev._ + + class LaneOutput extends Bundle { + val result = acc_t.cloneType + val stats_id = UInt(log2Up(num_stats).W) + } + + val io = IO(new Bundle { + val ins = Flipped(Valid(new Bundle { + val len = UInt(log2Up(n_lanes+1).W) + val data = Vec(n_lanes, acc_t) + val mean = acc_t.cloneType + val max = acc_t.cloneType + val iexp_const = new IExpConst(acc_t) + val cmd = NormCmd() + val stats_id = UInt(log2Up(num_stats).W) + })) + + val out = Valid(new LaneOutput) + + val busy = Output(Bool()) + }) + + val cmd = io.ins.bits.cmd + val mean = io.ins.bits.mean + val iexp_c = io.ins.bits.iexp_const + + val data = io.ins.bits.data.zipWithIndex.map { case (d, i) => + val iexp_result = iexp(d - io.ins.bits.max, iexp_c.qln2, iexp_c.qln2_inv, iexp_c.qb, iexp_c.qc) + Mux(i.U < io.ins.bits.len, + MuxCase(d, Seq( + (cmd === NormCmd.VARIANCE || cmd === NormCmd.INV_STDDEV) -> (d-mean)*(d-mean), + (cmd === NormCmd.SUM_EXP || cmd === 
NormCmd.INV_SUM_EXP) -> + iexp_result //iexp(d - io.ins.bits.max, iexp_c.qln2, iexp_c.qln2_inv, iexp_c.qb, iexp_c.qc) + )).withWidthOf(acc_t), + d.zero) + } + + val result = data.reduce(_ + _) + + val pipe = Module(new Pipeline[LaneOutput](new LaneOutput, latency)()) + + pipe.io.in.valid := io.ins.valid + // io.ins.ready := pipe.io.in.ready + pipe.io.in.bits.result := result + pipe.io.in.bits.stats_id := io.ins.bits.stats_id + + io.out.valid := pipe.io.out.valid + pipe.io.out.ready := true.B + // pipe.io.out.ready := io.out.ready + io.out.bits := pipe.io.out.bits + + io.busy := pipe.io.busy +} + +class MaxLanes[T <: Data](num_stats: Int, acc_t: T, n_lanes: Int, latency: Int)(implicit ev: Arithmetic[T]) + extends Module { + // Reduces the first `len` input lanes to their maximum value + + import ev._ + import NormCmd._ + + class LaneOutput extends Bundle { + val result = acc_t.cloneType + val stats_id = UInt(log2Up(num_stats).W) + } + + val io = IO(new Bundle { + val ins = Flipped(Valid(new Bundle { + val len = UInt(log2Up(n_lanes + 1).W) + val data = Vec(n_lanes, acc_t) + val stats_id = UInt(log2Up(num_stats).W) + })) + + val out = Valid(new LaneOutput) + + val busy = Output(Bool()) + }) + + val data = io.ins.bits.data.zipWithIndex.map { case (d, i) => + Mux(i.U < io.ins.bits.len, d.withWidthOf(acc_t), d.minimum) + } + + val result = data.reduce({ (max, x) => Mux(x > max, x, max) }) + + val pipe = Module(new Pipeline[LaneOutput](new LaneOutput, latency)()) + + pipe.io.in.valid := io.ins.valid + // io.ins.ready := pipe.io.in.ready + pipe.io.in.bits.result := result + pipe.io.in.bits.stats_id := io.ins.bits.stats_id + + io.out.valid := pipe.io.out.valid + pipe.io.out.ready := true.B + // pipe.io.out.ready := io.out.ready + io.out.bits := pipe.io.out.bits + + io.busy := pipe.io.busy +} + +class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_stats: Int, latency: Int, + fullDataType: Vec[Vec[T]], scale_t: U) + (implicit ev: Arithmetic[T]) extends
Module { + import ev._ + val acc_t = fullDataType.head.head.cloneType + val vec_size = fullDataType.flatten.size + val n_lanes = if (num_reduce_lanes < 0) vec_size else num_reduce_lanes + + assert(isPow2(n_lanes)) + + val io = IO(new Bundle { + val in = Flipped(Decoupled(new NormalizedInput[T,U](max_len, num_stats, fullDataType, scale_t))) + val out = Decoupled(new NormalizedOutput(fullDataType, scale_t)) + }) + + object State extends ChiselEnum { + // NOTE: We assume that "idle" and "output" are the first two states. We also assume that all the enums on the same + // line keep the order below + val idle, output = Value + val get_sum = Value + val get_mean, waiting_for_mean = Value + val get_variance, waiting_for_variance, get_stddev, waiting_for_stddev, get_inv_stddev, waiting_for_inv_stddev = Value + val get_max = Value + val get_inv_sum_exp, waiting_for_inv_sum_exp = Value + } + import State._ + + // Buffers for normalization stats + class Stats extends Bundle { + val req = new NormalizedInput[T,U](max_len, num_stats, fullDataType, scale_t) + val state = State() + + // Running state + val sum = acc_t.cloneType + val count = UInt(16.W) // TODO magic number + val running_max = acc_t.cloneType + val max = acc_t.cloneType + + // Iterative state + val mean = acc_t.cloneType + val inv_stddev = acc_t.cloneType + val inv_sum_exp = acc_t.cloneType + + val elems_left = req.len.cloneType + + def vec_grouped = VecInit(req.acc_read_resp.data.flatten.grouped(n_lanes).map(v => VecInit(v)).toSeq) + def vec_groups_left = elems_left / n_lanes.U + (elems_left % n_lanes.U =/= 0.U) + + def cmd = req.cmd + + def waiting_for_lanes_to_drain = + (cmd === NormCmd.MEAN && (state === get_sum || state === get_mean)) || + (cmd === NormCmd.INV_STDDEV && (state === get_sum || state === get_variance)) || + (cmd === NormCmd.MAX && (state === get_max)) || + (cmd === NormCmd.INV_SUM_EXP && (state === get_sum)) + } + + val stats = Reg(Vec(num_stats, new Stats)) + val done_with_functional_units = 
Wire(Vec(num_stats, Bool())) + val next_states = Wire(Vec(num_stats, State())) + + (stats.map(_.state) zip next_states).foreach { case (s, ns) => s := ns } + + // IO + val in_stats_id = io.in.bits.stats_id + io.in.ready := (stats(in_stats_id).state === idle || done_with_functional_units(in_stats_id)) && + stats.map(!_.waiting_for_lanes_to_drain).reduce(_ && _) + + val out_stats_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => (s.state === output) -> i.U } + ) + + io.out.valid := stats(out_stats_id).state === output + io.out.bits.acc_read_resp := stats(out_stats_id).req.acc_read_resp + io.out.bits.mean := stats(out_stats_id).mean + io.out.bits.max := stats(out_stats_id).max + io.out.bits.inv_stddev := stats(out_stats_id).inv_stddev.asTypeOf(scale_t) + io.out.bits.inv_sum_exp := stats(out_stats_id).inv_sum_exp.asTypeOf(scale_t) + + // Lanes and functional units + val lanes = Module(new AccumulationLanes(num_stats, acc_t, n_lanes, latency)) + val max_lanes = Module(new MaxLanes(num_stats, acc_t, n_lanes, latency)) // TODO: change latency? 
+ + { + // Lanes input + val in_lanes_stats_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => (s.state === get_sum) -> i.U } + ) + + val stat = stats(in_lanes_stats_id) + + val len = Mux(stat.elems_left % n_lanes.U === 0.U, n_lanes.U, stat.elems_left % n_lanes.U) + + lanes.io.ins.valid := stat.state === get_sum && stat.vec_groups_left > 0.U + lanes.io.ins.bits.data := stat.vec_grouped(stat.vec_groups_left-1.U) + lanes.io.ins.bits.mean := stat.mean + lanes.io.ins.bits.max := stat.max + + val iexp_const = Wire(new IExpConst(acc_t)) + iexp_const.qln2 := io.in.bits.acc_read_resp.iexp_qln2.asTypeOf(iexp_const.qln2) + iexp_const.qln2_inv := io.in.bits.acc_read_resp.iexp_qln2_inv.asTypeOf(iexp_const.qln2_inv) + iexp_const.qb := io.in.bits.acc_read_resp.igelu_qb.asTypeOf(iexp_const.qb) + iexp_const.qc := io.in.bits.acc_read_resp.igelu_qc.asTypeOf(iexp_const.qc) + + lanes.io.ins.bits.cmd := stat.cmd + lanes.io.ins.bits.len := len + lanes.io.ins.bits.stats_id := in_lanes_stats_id + lanes.io.ins.bits.iexp_const := iexp_const + + when (lanes.io.ins.fire()) { + stat.elems_left := stat.elems_left - len + } + } + + { + // Lanes output + val out_lanes_stats_id = lanes.io.out.bits.stats_id + + val stat = stats(out_lanes_stats_id) + + when (lanes.io.out.fire()) { + stat.sum := stat.sum + lanes.io.out.bits.result + } + } + + { + // Max lanes input + val max_in_lanes_stats_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => (s.state === get_max) -> i.U } + ) + + val stat = stats(max_in_lanes_stats_id) + + val len = Mux(stat.elems_left % n_lanes.U === 0.U, n_lanes.U, stat.elems_left % n_lanes.U) + + max_lanes.io.ins.valid := stat.state === get_max && stat.vec_groups_left > 0.U + max_lanes.io.ins.bits.data := stat.vec_grouped(stat.vec_groups_left-1.U) + max_lanes.io.ins.bits.len := len + max_lanes.io.ins.bits.stats_id := max_in_lanes_stats_id + + when (max_lanes.io.ins.fire()) { + stat.elems_left := stat.elems_left - len + } + } + + { + // Max 
lanes output + val max_out_lanes_stats_id = max_lanes.io.out.bits.stats_id + + val stat = stats(max_out_lanes_stats_id) + + when (max_lanes.io.out.fire()) { + stat.running_max := Mux(max_lanes.io.out.bits.result > stat.running_max, max_lanes.io.out.bits.result, stat.running_max) + //stat.max := Mux(max_lanes.io.out.bits.result > stat.max, max_lanes.io.out.bits.result, stat.max) + } + } + + val sum_to_divide_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === get_mean || s.state === get_variance) -> i.U } + ) + val sum_to_divide = stats(sum_to_divide_id).sum + val (divider_in, divider_out) = sum_to_divide.divider(stats.head.count).get + + { + // Divider input + val stat = stats(sum_to_divide_id) + + divider_in.valid := (stat.state === get_mean || stat.state === get_variance) && !lanes.io.busy + divider_in.bits := stat.count + } + + { + // Divider output + val waiting_for_divide_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === waiting_for_mean || s.state === waiting_for_variance) -> i.U } + ) + val stat = stats(waiting_for_divide_id) + + divider_out.ready := stat.state === waiting_for_mean || stat.state === waiting_for_variance + + when(stat.state === waiting_for_mean) { + stat.mean := divider_out.bits + }.elsewhen(stat.state === waiting_for_variance) { + stat.inv_stddev := divider_out.bits + } + } + + val variance_to_sqrt_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === get_stddev) -> i.U } + ) + val variance_to_sqrt = stats(variance_to_sqrt_id).inv_stddev + val (sqrt_in, sqrt_out) = variance_to_sqrt.sqrt.get + + { + // Sqrt input + val stat = stats(variance_to_sqrt_id) + + sqrt_in.valid := stat.state === get_stddev + } + + { + // Sqrt output + val waiting_for_sqrt_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === waiting_for_stddev) -> i.U } + ) + val stat = stats(waiting_for_sqrt_id) + + sqrt_out.ready := stat.state === 
waiting_for_stddev + + // TODO this fallback for stddev === 0 only works if acc_t is an SInt + assert(acc_t.isInstanceOf[SInt]) + + when (stat.state === waiting_for_stddev) { + stat.inv_stddev := Mux(sqrt_out.bits.asUInt() === acc_t.zero.asUInt(), + 1.S(acc_t.getWidth.W).asTypeOf(acc_t), + sqrt_out.bits + ) + } + } + + val stddev_to_inv_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === get_inv_stddev) -> i.U } + ) + val stddev_to_inv = stats(stddev_to_inv_id).inv_stddev + val (reciprocal_in, reciprocal_out) = stddev_to_inv.reciprocal(scale_t).get + + { + // Reciprocal input + val stat = stats(stddev_to_inv_id) + + reciprocal_in.valid := stat.state === get_inv_stddev + reciprocal_in.bits := DontCare + } + + { + // Reciprocal output + val waiting_for_reciprocal_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === waiting_for_inv_stddev) -> i.U } + ) + val stat = stats(waiting_for_reciprocal_id) + + reciprocal_out.ready := stat.state === waiting_for_inv_stddev + + when (stat.state === waiting_for_inv_stddev) { + stat.inv_stddev := reciprocal_out.bits.asTypeOf(stat.inv_stddev) + } + } + + val sum_exp_to_inv_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === get_inv_sum_exp) -> i.U } + ) + val sum_exp_to_inv = stats(sum_exp_to_inv_id).sum + val exp_divider_in = Wire(Decoupled(UInt(0.W))) + val exp_divider_out = Wire(Decoupled(scale_t.cloneType)) + + scale_t match { + case Float(expWidth, sigWidth) => + + exp_divider_in.bits := DontCare + + // We translate our integer to floating-point form so that we can use the hardfloat divider + def in_to_float(x: SInt) = { + val in_to_rec_fn = Module(new INToRecFN(intWidth = sum_exp_to_inv.getWidth, expWidth, sigWidth)) + in_to_rec_fn.io.signedIn := true.B + in_to_rec_fn.io.in := x.asUInt() + in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag + in_to_rec_fn.io.detectTininess := 
consts.tininess_afterRounding + + in_to_rec_fn.io.out + } + + val self_rec = in_to_float(sum_exp_to_inv.asUInt().asSInt()) + val one_rec = in_to_float(127.S) // softmax maximum is 127 for signed int8 + + // Instantiate the hardloat divider + val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 0)) + + exp_divider_in.ready := divider.io.inReady + divider.io.inValid := exp_divider_in.valid + divider.io.sqrtOp := false.B + divider.io.a := one_rec + divider.io.b := self_rec + divider.io.roundingMode := consts.round_near_even + divider.io.detectTininess := consts.tininess_afterRounding + + exp_divider_out.valid := divider.io.outValid_div + exp_divider_out.bits := fNFromRecFN(expWidth, sigWidth, divider.io.out).asTypeOf(scale_t) + } + + + { + // Divider input + val stat = stats(sum_exp_to_inv_id) + + exp_divider_in.valid := (stat.state === get_inv_sum_exp) && !lanes.io.busy + exp_divider_in.bits := sum_exp_to_inv.asUInt() + } + + { + // Divider output + val waiting_for_divide_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === waiting_for_inv_sum_exp) -> i.U } + ) + val stat = stats(waiting_for_divide_id) + + exp_divider_out.ready := stat.state === waiting_for_inv_sum_exp + + when (stat.state === waiting_for_inv_sum_exp) { + stat.inv_sum_exp := exp_divider_out.bits.asTypeOf(stat.inv_sum_exp) + } + } + + // State transitions + for (((stat, next_state), id) <- (stats zip next_states).zipWithIndex) { + val state = stat.state + val cmd = stat.cmd + + val done = done_with_functional_units(id) + + when (state === idle) { + // We have a different "when" statement below to support the case where a new row is input into the normalizer + next_state := idle + done := DontCare + }.elsewhen(state === output) { + next_state := Mux(io.out.fire() && out_stats_id === id.U, idle, state) + done := io.out.fire() && out_stats_id === id.U + }.elsewhen(state === get_max) { + val is_last_lane_input = stat.vec_groups_left === 0.U || + 
(stat.vec_groups_left === 1.U && + max_lanes.io.ins.bits.stats_id === id.U && + max_lanes.io.ins.fire()) + + next_state := Mux( + is_last_lane_input, + MuxCase(state, Seq( + (cmd === NormCmd.MAX) -> idle, + (cmd === NormCmd.SUM_EXP || cmd === NormCmd.INV_SUM_EXP) -> get_sum + )), + state + ) + + done := is_last_lane_input && cmd === NormCmd.MAX + }.elsewhen(state === get_sum) { + val is_last_lane_input = stat.vec_groups_left === 0.U || + (stat.vec_groups_left === 1.U && + lanes.io.ins.bits.stats_id === id.U && + lanes.io.ins.fire()) + + next_state := Mux( + is_last_lane_input, + MuxCase(state, Seq( + (cmd === NormCmd.SUM || cmd === NormCmd.VARIANCE || cmd === NormCmd.SUM_EXP) -> idle, + (cmd === NormCmd.MEAN) -> get_mean, + (cmd === NormCmd.INV_STDDEV) -> get_variance, + (cmd === NormCmd.INV_SUM_EXP) -> get_inv_sum_exp, + )), + state + ) +// next_state := Mux(cmd === NormCmd.SUM || cmd === NormCmd.VARIANCE, +// Mux(is_last_lane_input, idle, state), +// Mux(is_last_lane_input, +// Mux(cmd === NormCmd.MEAN, get_mean, get_variance), +// state) +// ) + + done := is_last_lane_input && cmd =/= NormCmd.MEAN && cmd =/= NormCmd.INV_STDDEV && cmd =/= NormCmd.INV_SUM_EXP + }.elsewhen(state === get_mean || state === get_variance) { + next_state := Mux(divider_in.fire() && sum_to_divide_id === id.U, state.next, state) + done := false.B + }.elsewhen(state === waiting_for_mean) { + next_state := Mux(divider_out.fire(), idle, state) + done := divider_out.fire() + }.elsewhen(state === waiting_for_variance) { + next_state := Mux(divider_out.fire(), get_stddev, state) + done := false.B + }.elsewhen(state === get_stddev) { + next_state := Mux(sqrt_in.fire() && variance_to_sqrt_id === id.U, state.next, state) + done := false.B + }.elsewhen(state === waiting_for_stddev) { + next_state := Mux(sqrt_out.fire(), state.next, state) + done := false.B + }.elsewhen(state === get_inv_stddev) { + next_state := Mux(reciprocal_in.fire() && stddev_to_inv_id === id.U, state.next, state) + done := 
false.B + }.elsewhen(state === waiting_for_inv_stddev) { + next_state := Mux(reciprocal_out.fire(), idle, state) + done := reciprocal_out.fire() + }.elsewhen(state === get_inv_sum_exp) { + next_state := Mux(exp_divider_in.fire() && sum_exp_to_inv_id === id.U, state.next, state) + done := false.B + }.elsewhen(state === waiting_for_inv_sum_exp) { + next_state := Mux(exp_divider_out.fire(), idle, state) + done := exp_divider_out.fire() + }.otherwise { + assert(false.B, "invalid state in Normalizer") + next_state := DontCare + done := DontCare + } + + when (io.in.fire() && in_stats_id === id.U) { + next_state := Mux(io.in.bits.cmd === NormCmd.RESET, output, + Mux(io.in.bits.cmd === NormCmd.MAX, get_max, get_sum)) + when (io.in.bits.cmd === NormCmd.SUM_EXP) { + stat.max := stat.running_max + } + } + } + + // Update stats variables + for (((stat, next_state), id) <- (stats zip next_states).zipWithIndex) { + val state = stat.state + + val reset_running_state = + state === output || + (state === get_mean && next_state =/= get_mean) || + (state === get_variance && next_state =/= get_variance) + + val is_input = io.in.fire() && in_stats_id === id.U + + when (is_input) { + stat.req := io.in.bits + stat.count := stat.count + io.in.bits.len + stat.elems_left := io.in.bits.len + } + + when(reset_running_state) { + stat.sum := acc_t.zero + stat.count := Mux(is_input, io.in.bits.len, 0.U) + } + + when (state =/= get_inv_sum_exp && next_state === get_inv_sum_exp) { + stat.running_max := acc_t.minimum + } + } + + dontTouch(stats) + + // Assertions + assert(PopCount(stats.map(s => s.state === waiting_for_mean || s.state === waiting_for_variance)) <= 1.U, "we don't support pipelining the divider/sqrt-unit/inv-unit right now") + assert(PopCount(stats.map(_.state === waiting_for_stddev)) <= 1.U, "we don't support pipelining the divider/sqrt-unit/inv-unit right now") + assert(PopCount(stats.map(_.state === waiting_for_inv_stddev)) <= 1.U, "we don't support pipelining the 
divider/sqrt-unit/inv-unit right now") + assert(PopCount(stats.map(_.state === output)) <= 1.U, "multiple outputs at same time") + assert(acc_t.getWidth == scale_t.getWidth, "we use the same variable to hold both the variance and the inv-stddev, so we need them to see the width") + + // Resets + when (reset.asBool()) { + stats.foreach(_.state := idle) + stats.foreach(_.sum := acc_t.zero) + stats.foreach(_.max := acc_t.minimum) + stats.foreach(_.running_max := acc_t.minimum) + stats.foreach(_.count := 0.U) + stats.foreach(_.inv_sum_exp := acc_t.zero) + } +} + +object Normalizer { + def apply[T <: Data, U <: Data](is_passthru: Boolean, max_len: Int, num_reduce_lanes: Int, num_stats: Int, + latency: Int, fullDataType: Vec[Vec[T]], scale_t: U)(implicit ev: Arithmetic[T]): + (DecoupledIO[NormalizedInput[T,U]], DecoupledIO[NormalizedOutput[T,U]]) = { + if (is_passthru) { + passthru(max_len = max_len, num_stats = num_stats, fullDataType = fullDataType, scale_t = scale_t) + } else { + gen(max_len = max_len, num_reduce_lanes = num_reduce_lanes, num_stats = num_stats, latency = latency, + fullDataType = fullDataType, scale_t = scale_t) + } + } + + def gen[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_stats: Int, latency: Int, + fullDataType: Vec[Vec[T]], scale_t: U)(implicit ev: Arithmetic[T]): (DecoupledIO[NormalizedInput[T,U]], DecoupledIO[NormalizedOutput[T,U]]) = { + val norm_unit_module = Module(new Normalizer(max_len, num_reduce_lanes, num_stats, latency, fullDataType, scale_t)) + (norm_unit_module.io.in, norm_unit_module.io.out) + } + + def passthru[T <: Data, U <: Data](max_len: Int, num_stats: Int, fullDataType: Vec[Vec[T]], scale_t: U) + (implicit ev: Arithmetic[T]): (DecoupledIO[NormalizedInput[T,U]], DecoupledIO[NormalizedOutput[T,U]]) = { + + val norm_unit_passthru_q = Module(new Queue(new NormalizedInput[T,U](max_len, num_stats, fullDataType, scale_t), 2)) + val norm_unit_passthru_out = Wire(Decoupled(new NormalizedOutput(fullDataType, 
scale_t))) + + norm_unit_passthru_out.valid := norm_unit_passthru_q.io.deq.valid + norm_unit_passthru_out.bits.acc_read_resp := norm_unit_passthru_q.io.deq.bits.acc_read_resp + norm_unit_passthru_out.bits.mean := DontCare + norm_unit_passthru_out.bits.max := DontCare + norm_unit_passthru_out.bits.inv_stddev := DontCare + norm_unit_passthru_out.bits.inv_sum_exp := DontCare + + norm_unit_passthru_q.io.deq.ready := norm_unit_passthru_out.ready + + (norm_unit_passthru_q.io.enq, norm_unit_passthru_out) + } +} diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala index 8bb03415..72386aad 100644 --- a/src/main/scala/gemmini/ReservationStation.scala +++ b/src/main/scala/gemmini/ReservationStation.scala @@ -293,9 +293,9 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G } val is_load = funct === LOAD_CMD || funct === LOAD2_CMD || funct === LOAD3_CMD || (funct === CONFIG_CMD && config_cmd_type === CONFIG_LOAD) - val is_store = funct === STORE_CMD || (funct === CONFIG_CMD && config_cmd_type === CONFIG_STORE) - val is_ex = funct === PRELOAD_CMD || funct_is_compute || (funct === CONFIG_CMD && (config_cmd_type === CONFIG_EX || config_cmd_type === CONFIG_IM2COL)) - val is_im2col = funct === CONFIG_CMD && config_cmd_type === CONFIG_IM2COL // im2col commands are a subset of ex commands, so they still go in the ex queue + val is_ex = funct === PRELOAD_CMD || funct_is_compute || (funct === CONFIG_CMD && config_cmd_type === CONFIG_EX) + val is_store = funct === STORE_CMD || (funct === CONFIG_CMD && (config_cmd_type === CONFIG_STORE || config_cmd_type === CONFIG_NORM)) + val is_norm = funct === CONFIG_CMD && config_cmd_type === CONFIG_NORM // normalization commands are a subset of store commands, so they still go in the store queue new_entry.q := Mux1H(Seq( is_load -> ldq, @@ -364,7 +364,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G } when (io.alloc.fire) { 
- when (new_entry.is_config && new_entry.q === exq && !is_im2col) { + when (new_entry.is_config && new_entry.q === exq) { a_stride := new_entry.cmd.cmd.rs1(31, 16) // TODO magic numbers // TODO this needs to be kept in sync with ExecuteController.scala c_stride := new_entry.cmd.cmd.rs2(63, 48) // TODO magic numbers // TODO this needs to be kept in sync with ExecuteController.scala val set_only_strides = new_entry.cmd.cmd.rs1(7) // TODO magic numbers @@ -377,7 +377,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val repeat_pixels = maxOf(new_entry.cmd.cmd.rs1(8 + pixel_repeats_bits - 1, 8), 1.U) // TODO we use a default value of pixel repeats here, for backwards compatibility. However, we should deprecate and remove this default value eventually ld_block_strides(id) := block_stride ld_pixel_repeats(id) := repeat_pixels - 1.U - }.elsewhen(new_entry.is_config && new_entry.q === stq) { + }.elsewhen(new_entry.is_config && new_entry.q === stq && !is_norm) { val pool_stride = new_entry.cmd.cmd.rs1(5, 4) // TODO magic numbers pooling_is_enabled := pool_stride =/= 0.U }.elsewhen(funct === PRELOAD_CMD) { diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index 008dc990..64b66bde 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -1,3 +1,4 @@ + package gemmini import chisel3._ @@ -26,13 +27,18 @@ class ScratchpadMemReadRequest[U <: Data](local_addr_t: LocalAddr, scale_t_bits: } -class ScratchpadMemWriteRequest(local_addr_t: LocalAddr, scale_t_bits: Int) +class ScratchpadMemWriteRequest(local_addr_t: LocalAddr, acc_t_bits: Int, scale_t_bits: Int) (implicit p: Parameters) extends CoreBundle { val vaddr = UInt(coreMaxAddrBits.W) val laddr = local_addr_t.cloneType - val acc_act = UInt(2.W) // TODO don't use a magic number for the width here + val acc_act = UInt(Activation.bitwidth.W) // TODO don't use a magic number for the width here val acc_scale 
= UInt(scale_t_bits.W) + val acc_igelu_qb = UInt(acc_t_bits.W) + val acc_igelu_qc = UInt(acc_t_bits.W) + val acc_iexp_qln2 = UInt(acc_t_bits.W) + val acc_iexp_qln2_inv = UInt(acc_t_bits.W) + val acc_norm_stats_id = UInt(8.W) // TODO magic number val len = UInt(16.W) // TODO don't use a magic number for the width here val block = UInt(8.W) // TODO don't use a magic number for the width here @@ -58,14 +64,12 @@ class ScratchpadMemReadResponse extends Bundle { class ScratchpadReadMemIO[U <: Data](local_addr_t: LocalAddr, scale_t_bits: Int)(implicit p: Parameters) extends CoreBundle { val req = Decoupled(new ScratchpadMemReadRequest(local_addr_t, scale_t_bits)) val resp = Flipped(Valid(new ScratchpadMemReadResponse)) - } -class ScratchpadWriteMemIO(local_addr_t: LocalAddr, scale_t_bits: Int) +class ScratchpadWriteMemIO(local_addr_t: LocalAddr, acc_t_bits: Int, scale_t_bits: Int) (implicit p: Parameters) extends CoreBundle { - val req = Decoupled(new ScratchpadMemWriteRequest(local_addr_t, scale_t_bits)) + val req = Decoupled(new ScratchpadMemWriteRequest(local_addr_t, acc_t_bits, scale_t_bits)) val resp = Flipped(Valid(new ScratchpadMemWriteResponse)) - } class ScratchpadReadReq(val n: Int) extends Bundle { @@ -203,7 +207,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, // DMA ports val dma = new Bundle { val read = Flipped(new ScratchpadReadMemIO(local_addr_t, mvin_scale_t_bits)) - val write = Flipped(new ScratchpadWriteMemIO(local_addr_t, acc_scale_t_bits)) + val write = Flipped(new ScratchpadWriteMemIO(local_addr_t, accType.getWidth, acc_scale_t_bits)) } // SRAM ports @@ -215,7 +219,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, // Accumulator ports val acc = new Bundle { val read_req = Flipped(Vec(acc_banks, Decoupled(new AccumulatorReadReq( - acc_bank_entries, log2Up(accType.getWidth), acc_scale_t.asInstanceOf[V] + acc_bank_entries, accType, acc_scale_t.asInstanceOf[V] )))) val read_resp = 
Vec(acc_banks, Decoupled(new AccumulatorScaleResp( Vec(meshColumns, Vec(tileColumns, inputType)), @@ -242,25 +246,37 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, }) val write_dispatch_q = Queue(io.dma.write.req) - write_dispatch_q.ready := false.B - // Write scale queue is necessary to maintain in-order requests to accumulator scale unit + // Write norm/scale queues are necessary to maintain in-order requests to accumulator norm/scale units // Writes from main SPAD just flow directly between scale_q and issue_q, while writes // From acc are ordered - val write_scale_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), spad_read_delay)) - val write_issue_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), spad_read_delay+1, pipe=true)) + val write_norm_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, accType.getWidth, acc_scale_t_bits), spad_read_delay+2)) + val write_scale_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, accType.getWidth, acc_scale_t_bits), spad_read_delay+2)) + val write_issue_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, accType.getWidth, acc_scale_t_bits), spad_read_delay+1, pipe=true)) val read_issue_q = Module(new Queue(new ScratchpadMemReadRequest(local_addr_t, mvin_scale_t_bits), spad_read_delay+1, pipe=true)) // TODO can't this just be a normal queue? 
+ write_dispatch_q.ready := false.B + + write_norm_q.io.enq.valid := false.B + write_norm_q.io.enq.bits := write_dispatch_q.bits + write_norm_q.io.deq.ready := false.B + write_scale_q.io.enq.valid := false.B - write_scale_q.io.enq.bits := write_dispatch_q.bits + write_scale_q.io.enq.bits := write_norm_q.io.deq.bits write_scale_q.io.deq.ready := false.B write_issue_q.io.enq.valid := false.B write_issue_q.io.enq.bits := write_scale_q.io.deq.bits - // Garbage can immediately fire between dispatch_q and scale_q + // Garbage can immediately fire from dispatch_q -> norm_q when (write_dispatch_q.bits.laddr.is_garbage()) { - write_scale_q.io.enq <> write_dispatch_q + write_norm_q.io.enq <> write_dispatch_q } + + // Non-acc or garbage can immediately fire between norm_q and scale_q + when (write_norm_q.io.deq.bits.laddr.is_garbage() || !write_norm_q.io.deq.bits.laddr.is_acc_addr) { + write_scale_q.io.enq <> write_norm_q.io.deq + } + // Non-acc or garbage can immediately fire between scale_q and issue_q when (write_scale_q.io.deq.bits.laddr.is_garbage() || !write_scale_q.io.deq.bits.laddr.is_acc_addr) { write_issue_q.io.enq <> write_scale_q.io.deq @@ -425,7 +441,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, writer.module.io.flush := io.flush reader.module.io.flush := io.flush - io.busy := writer.module.io.busy || reader.module.io.busy || write_issue_q.io.deq.valid || write_scale_q.io.deq.valid || write_dispatch_q.valid + io.busy := writer.module.io.busy || reader.module.io.busy || write_issue_q.io.deq.valid || write_norm_q.io.deq.valid || write_scale_q.io.deq.valid || write_dispatch_q.valid val spad_mems = { val banks = Seq.fill(sp_banks) { Module(new ScratchpadBank( @@ -444,7 +460,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, val exread = ex_read_req.valid // TODO we tie the write dispatch queue's, and write issue queue's, ready and valid signals together here - val dmawrite = 
write_dispatch_q.valid && write_scale_q.io.enq.ready && + val dmawrite = write_dispatch_q.valid && write_norm_q.io.enq.ready && !write_dispatch_q.bits.laddr.is_garbage() && !(bio.write.en && config.sp_singleported.B) && !write_dispatch_q.bits.laddr.is_acc_addr && write_dispatch_q.bits.laddr.sp_bank() === i.U @@ -462,7 +478,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, when (bio.read.req.fire) { write_dispatch_q.ready := true.B - write_scale_q.io.enq.valid := true.B + write_norm_q.io.enq.valid := true.B io.dma.write.resp.valid := true.B } @@ -543,34 +559,73 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, val acc_row_t = Vec(meshColumns, Vec(tileColumns, accType)) val spad_row_t = Vec(meshColumns, Vec(tileColumns, inputType)) +// val acc_norm_unit = Module(new Normalizer( +// max_len = block_cols, +// num_reduce_lanes = -1, +// num_stats = 4, +// latency = 4, +// fullDataType = acc_row_t, +// scale_t = acc_scale_t, +// )) + + val (acc_norm_unit_in, acc_norm_unit_out) = Normalizer( + is_passthru = !config.has_normalizations, + max_len = block_cols, + num_reduce_lanes = -1, + num_stats = 4, + latency = 4, + fullDataType = acc_row_t, + scale_t = acc_scale_t, + ) + + acc_norm_unit_in.valid := false.B + acc_norm_unit_in.bits.len := write_norm_q.io.deq.bits.len + acc_norm_unit_in.bits.stats_id := write_norm_q.io.deq.bits.acc_norm_stats_id + acc_norm_unit_in.bits.cmd := write_norm_q.io.deq.bits.laddr.norm_cmd + acc_norm_unit_in.bits.acc_read_resp := DontCare + val acc_scale_unit = Module(new AccumulatorScale( acc_row_t, spad_row_t, acc_scale_t.asInstanceOf[V], - log2Up(accType.getWidth), acc_read_small_width, acc_read_full_width, acc_scale_func, acc_scale_num_units, acc_scale_latency, has_nonlinear_activations, + has_normalizations, )) - acc_scale_unit.io.in.valid := false.B - acc_scale_unit.io.in.bits := DontCare - val dma_resp_ready = ( - writer.module.io.req.ready && - 
write_issue_q.io.deq.bits.laddr.is_acc_addr && - !write_issue_q.io.deq.bits.laddr.is_garbage() - ) + val acc_waiting_to_be_scaled = write_scale_q.io.deq.valid && + !write_scale_q.io.deq.bits.laddr.is_garbage() && + write_scale_q.io.deq.bits.laddr.is_acc_addr && + write_issue_q.io.enq.ready + + acc_norm_unit_out.ready := acc_scale_unit.io.in.ready && acc_waiting_to_be_scaled + acc_scale_unit.io.in.valid := acc_norm_unit_out.valid && acc_waiting_to_be_scaled + acc_scale_unit.io.in.bits := acc_norm_unit_out.bits + + when (acc_scale_unit.io.in.fire()) { + write_issue_q.io.enq <> write_scale_q.io.deq + } + acc_scale_unit.io.out.ready := false.B + + val dma_resp_ready = + writer.module.io.req.ready && + write_issue_q.io.deq.bits.laddr.is_acc_addr && + !write_issue_q.io.deq.bits.laddr.is_garbage() + when (acc_scale_unit.io.out.bits.fromDMA && dma_resp_ready) { + // Send the acc-scale result into the DMA acc_scale_unit.io.out.ready := true.B writeData.valid := acc_scale_unit.io.out.valid writeData.bits := acc_scale_unit.io.out.bits.data.asUInt fullAccWriteData := acc_scale_unit.io.out.bits.full_data.asUInt } for (i <- 0 until acc_banks) { + // Send the acc-scale result to the ExController io.acc.read_resp(i).valid := false.B io.acc.read_resp(i).bits := acc_scale_unit.io.out.bits when (!acc_scale_unit.io.out.bits.fromDMA && acc_scale_unit.io.out.bits.acc_bank_id === i.U) { @@ -608,18 +663,21 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, val exread = ex_read_req.valid // TODO we tie the write dispatch queue's, and write issue queue's, ready and valid signals together here - val dmawrite = write_dispatch_q.valid && write_scale_q.io.enq.ready && + val dmawrite = write_dispatch_q.valid && write_norm_q.io.enq.ready && !write_dispatch_q.bits.laddr.is_garbage() && write_dispatch_q.bits.laddr.is_acc_addr && write_dispatch_q.bits.laddr.acc_bank() === i.U bio.read.req.valid := exread || dmawrite - bio.read.req.bits.relu6_shift :=
ex_read_req.bits.relu6_shift ex_read_req.ready := bio.read.req.ready // The ExecuteController gets priority when reading from accumulator banks when (exread) { bio.read.req.bits.addr := ex_read_req.bits.addr bio.read.req.bits.act := ex_read_req.bits.act + bio.read.req.bits.igelu_qb := ex_read_req.bits.igelu_qb + bio.read.req.bits.igelu_qc := ex_read_req.bits.igelu_qc + bio.read.req.bits.iexp_qln2 := ex_read_req.bits.iexp_qln2 + bio.read.req.bits.iexp_qln2_inv := ex_read_req.bits.iexp_qln2_inv bio.read.req.bits.scale := ex_read_req.bits.scale bio.read.req.bits.full := false.B bio.read.req.bits.fromDMA := false.B @@ -627,12 +685,16 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, bio.read.req.bits.addr := write_dispatch_q.bits.laddr.acc_row() bio.read.req.bits.full := write_dispatch_q.bits.laddr.read_full_acc_row bio.read.req.bits.act := write_dispatch_q.bits.acc_act + bio.read.req.bits.igelu_qb := write_dispatch_q.bits.acc_igelu_qb.asTypeOf(bio.read.req.bits.igelu_qb) + bio.read.req.bits.igelu_qc := write_dispatch_q.bits.acc_igelu_qc.asTypeOf(bio.read.req.bits.igelu_qc) + bio.read.req.bits.iexp_qln2 := write_dispatch_q.bits.acc_iexp_qln2.asTypeOf(bio.read.req.bits.iexp_qln2) + bio.read.req.bits.iexp_qln2_inv := write_dispatch_q.bits.acc_iexp_qln2_inv.asTypeOf(bio.read.req.bits.iexp_qln2_inv) bio.read.req.bits.scale := write_dispatch_q.bits.acc_scale.asTypeOf(bio.read.req.bits.scale) bio.read.req.bits.fromDMA := true.B when (bio.read.req.fire) { write_dispatch_q.ready := true.B - write_scale_q.io.enq.valid := true.B + write_norm_q.io.enq.valid := true.B io.dma.write.resp.valid := true.B } @@ -641,22 +703,24 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, } bio.read.resp.ready := false.B - when (write_scale_q.io.deq.valid && - acc_scale_unit.io.in.ready && - bio.read.resp.valid && - write_issue_q.io.enq.ready && - write_scale_q.io.deq.bits.laddr.is_acc_addr && - 
!write_scale_q.io.deq.bits.laddr.is_garbage() && - write_scale_q.io.deq.bits.laddr.acc_bank() === i.U) { - write_scale_q.io.deq.ready := true.B - acc_scale_unit.io.in.valid := true.B + when (write_norm_q.io.deq.valid && + acc_norm_unit_in.ready && + bio.read.resp.valid && + write_scale_q.io.enq.ready && + write_norm_q.io.deq.bits.laddr.is_acc_addr && + !write_norm_q.io.deq.bits.laddr.is_garbage() && + write_norm_q.io.deq.bits.laddr.acc_bank() === i.U) + { + write_norm_q.io.deq.ready := true.B + acc_norm_unit_in.valid := true.B bio.read.resp.ready := true.B - write_issue_q.io.enq.valid := true.B - acc_scale_unit.io.in.bits := bio.read.resp.bits - acc_scale_unit.io.in.bits.acc_bank_id := i.U - } + // Some normalizer commands don't write to main memory, so they don't need to be passed on to the scaling units + write_scale_q.io.enq.valid := NormCmd.writes_to_main_memory(write_norm_q.io.deq.bits.laddr.norm_cmd) + acc_norm_unit_in.bits.acc_read_resp := bio.read.resp.bits + acc_norm_unit_in.bits.acc_read_resp.acc_bank_id := i.U + } } // Writing to the accumulator banks @@ -682,7 +746,6 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, // We need to make sure that we don't try to return a dma read resp from both mvin_scale and mvin_scale_acc // at the same time. 
mvin_scale always gets priority in this cases - // val spad_last = mvin_scale_out.valid && mvin_scale_out.bits.last && !mvin_scale_out.bits.tag.is_acc val spad_last = mvin_scale_pixel_repeater.io.resp.valid && mvin_scale_pixel_repeater.io.resp.bits.last && !mvin_scale_pixel_repeater.io.resp.bits.tag.is_acc val dmaread = (from_mvin_scale || from_mvin_scale_acc) && diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala index 692a8e04..45ec459c 100644 --- a/src/main/scala/gemmini/StoreController.scala +++ b/src/main/scala/gemmini/StoreController.scala @@ -11,14 +11,14 @@ import midas.targetutils.PerfCounter // TODO this is almost a complete copy of LoadController. We should combine them into one class // TODO deal with errors when reading scratchpad responses -class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], coreMaxAddrBits: Int, local_addr_t: LocalAddr) - (implicit p: Parameters) extends Module { +class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], + coreMaxAddrBits: Int, local_addr_t: LocalAddr)(implicit p: Parameters) extends Module { import config._ val io = IO(new Bundle { val cmd = Flipped(Decoupled(new GemminiCmd(reservation_station_entries))) - val dma = new ScratchpadWriteMemIO(local_addr_t, acc_scale_t_bits) + val dma = new ScratchpadWriteMemIO(local_addr_t, accType.getWidth, acc_scale_t_bits) val completed = Decoupled(UInt(log2Up(reservation_station_entries).W)) @@ -42,7 +42,12 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm val block_cols = meshColumns * tileColumns val max_blocks = (dma_maxbytes / (block_cols * inputType.getWidth / 8)) max 1 - val activation = Reg(UInt(GemminiISA.CONFIG_MVOUT_RS1_ACTIVATION_WIDTH.W)) + val activation = Reg(UInt(Activation.bitwidth.W)) // TODO magic number + val igelu_qb = Reg(accType) + val igelu_qc = Reg(accType) + val iexp_qln2 
= Reg(accType) + val iexp_qln2_inv = Reg(accType) + val norm_stats_id = Reg(UInt(8.W)) // TODO magic number val acc_scale = Reg(acc_scale_t) //val row_counter = RegInit(0.U(log2Ceil(block_rows).W)) @@ -87,6 +92,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm val config_mvout_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvoutRs1) val config_mvout_rs2 = cmd.bits.cmd.rs2.asTypeOf(new ConfigMvoutRs2(acc_scale_t_bits, 32)) + val config_cmd_type = config_mvout_rs1.cmd_type val config_stride = config_mvout_rs2.stride val config_activation = config_mvout_rs1.activation val config_acc_scale = config_mvout_rs2.acc_scale @@ -100,10 +106,22 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm val config_upad = config_mvout_rs1.upad val config_lpad = config_mvout_rs1.lpad + val config_norm_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigNormRs1(accType.getWidth)) + val config_norm_rs2 = cmd.bits.cmd.rs2.asTypeOf(new ConfigNormRs2(accType.getWidth)) + val config_stats_id = config_norm_rs1.norm_stats_id + val config_activation_msb = config_norm_rs1.act_msb + val config_set_stats_id_only = config_norm_rs1.set_stats_id_only + val config_iexp_q_const_type = config_norm_rs1.q_const_type + val config_iexp_q_const = config_norm_rs1.q_const + val config_igelu_qb = config_norm_rs2.qb + val config_igelu_qc = config_norm_rs2.qc + + assert(config_norm_rs1.cmd_type === config_mvout_rs1.cmd_type) + val mstatus = cmd.bits.cmd.status val current_vaddr = vaddr + row_counter * stride - val current_localaddr = localaddr + (block_counter * block_stride + row_counter) + val current_localaddr = WireInit(localaddr + (block_counter * block_stride + row_counter)) val pool_row_addr = localaddr + (orow * pool_ocols +& ocol) when (orow_is_negative || ocol_is_negative || orow >= pool_orows || ocol >= pool_ocols) { @@ -112,8 +130,9 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm val pool_vaddr = vaddr + 
(porow_counter * pool_out_dim + pocol_counter) * stride // TODO get rid of these multiplications - val DoConfig = cmd.bits.cmd.inst.funct === CONFIG_CMD - val DoStore = !DoConfig // TODO change this if more commands are added + val DoConfig = cmd.bits.cmd.inst.funct === CONFIG_CMD && config_cmd_type === CONFIG_STORE + val DoConfigNorm = config.has_normalizations.B && cmd.bits.cmd.inst.funct === CONFIG_CMD && config_cmd_type === CONFIG_NORM + val DoStore = !DoConfig && !DoConfigNorm cmd.ready := false.B @@ -140,8 +159,15 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm io.dma.req.bits.vaddr := Mux(pooling_is_enabled || mvout_1d_enabled, pool_vaddr, current_vaddr) io.dma.req.bits.laddr := Mux(pooling_is_enabled, pool_row_addr, current_localaddr) //Todo: laddr for 1D? + io.dma.req.bits.laddr.norm_cmd := Mux(block_counter === blocks - 1.U, current_localaddr.norm_cmd, + NormCmd.non_reset_version(current_localaddr.norm_cmd)) io.dma.req.bits.acc_act := activation + io.dma.req.bits.acc_igelu_qb := igelu_qb.asTypeOf(io.dma.req.bits.acc_igelu_qb) + io.dma.req.bits.acc_igelu_qc := igelu_qc.asTypeOf(io.dma.req.bits.acc_igelu_qc) + io.dma.req.bits.acc_iexp_qln2 := iexp_qln2.asTypeOf(io.dma.req.bits.acc_iexp_qln2) + io.dma.req.bits.acc_iexp_qln2_inv := iexp_qln2_inv.asTypeOf(io.dma.req.bits.acc_iexp_qln2_inv) + io.dma.req.bits.acc_norm_stats_id := norm_stats_id io.dma.req.bits.acc_scale := acc_scale.asTypeOf(io.dma.req.bits.acc_scale) io.dma.req.bits.len := Mux(block_counter === blocks - 1.U, ((cols - 1.U) % block_cols.U) + 1.U, block_cols.U) @@ -221,10 +247,24 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm } cmd.ready := true.B } - .elsewhen(DoStore && cmd_tracker.io.alloc.fire()) { - val next_state = Mux(pooling_is_enabled, pooling, sending_rows) - control_state := Mux(io.dma.req.fire, next_state, waiting_for_dma_req_ready) + .elsewhen(config.has_normalizations.B && DoConfigNorm) { + when 
(!config_set_stats_id_only.asBool()) { + igelu_qb := config_igelu_qb.asTypeOf(igelu_qb) + igelu_qc := config_igelu_qc.asTypeOf(igelu_qc) + when(config_iexp_q_const_type === 0.U) { + iexp_qln2 := config_iexp_q_const.asTypeOf(iexp_qln2) + }.elsewhen(config_iexp_q_const_type === 1.U) { + iexp_qln2_inv := config_iexp_q_const.asTypeOf(iexp_qln2_inv) + } + activation := Cat(config_activation_msb, activation(1, 0)) // TODO: magic number } + norm_stats_id := config_stats_id + cmd.ready := true.B + } + .elsewhen(DoStore && cmd_tracker.io.alloc.fire()) { + val next_state = Mux(pooling_is_enabled, pooling, sending_rows) + control_state := Mux(io.dma.req.fire, next_state, waiting_for_dma_req_ready) + } } } @@ -260,6 +300,17 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm } } + // Optimizations when features are disabled + if (!config.has_normalizations) { + current_localaddr.norm_cmd := NormCmd.RESET + + igelu_qb := DontCare + igelu_qc := DontCare + iexp_qln2 := DontCare + iexp_qln2_inv := DontCare + norm_stats_id := 0.U + } + // Performance counter CounterEventIO.init(io.counter) io.counter.connectEventSignal(CounterEvent.STORE_ACTIVE_CYCLE, control_state === sending_rows || control_state === pooling) From ccd18b0396ad155c463248a70b58a7ff0eda9076 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Thu, 25 Aug 2022 15:21:36 -0700 Subject: [PATCH 04/64] Add parellelized testing infrastructure (#238) Also, fix MIDAS builds --- CHIPYARD.hash | 2 +- README.md | 14 +++++-- scripts/build-midas.sh | 2 + scripts/run-midas.sh | 3 +- software/gemmini-rocc-tests | 2 +- src/main/scala/gemmini/BeatMerger.scala | 11 ++++-- src/main/scala/gemmini/DMA.scala | 19 +++++---- src/main/scala/gemmini/LoadController.scala | 2 +- src/main/scala/gemmini/LoopConv.scala | 39 +++++++++++-------- src/main/scala/gemmini/LoopMatmul.scala | 1 + .../scala/gemmini/ReservationStation.scala | 7 ++-- src/main/scala/gemmini/StoreController.scala | 2 +- 
src/main/scala/gemmini/ZeroWriter.scala | 3 +- 13 files changed, 66 insertions(+), 41 deletions(-) diff --git a/CHIPYARD.hash b/CHIPYARD.hash index f41949c3..b154a058 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -117624d8eea27bafd613eec09e9b9b3e31239e08 +481398b910fa95ec88dd578c67ba358a4d83129d diff --git a/README.md b/README.md index b6b73b1c..77d6466d 100644 --- a/README.md +++ b/README.md @@ -32,20 +32,28 @@ Run these steps to install Chipyard and Spike (make sure to checkout the correct ```shell git clone https://github.com/ucb-bar/chipyard.git cd chipyard -git checkout 117624d8eea27bafd613eec09e9b9b3e31239e08 +git checkout 481398b910fa95ec88dd578c67ba358a4d83129d ./scripts/init-submodules-no-riscv-tools.sh ./scripts/build-toolchains.sh esp-tools source env.sh cd generators/gemmini -git fetch && git checkout dev && git pull origin dev +git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" +git fetch --unshallow && git checkout dev && git pull origin dev git submodule update cd - cd toolchains/esp-tools/riscv-isa-sim/build -git fetch && git checkout 090e82c473fd28b4eb2011ffcd771ead6076faab +git fetch --unshallow && git checkout 2ed403a70f65559a3c2a06bf724d4737edc73a23 make && make install + +# The final step is only necessary if you want to run MIDAS simulations with +# realistic DRAM models +cd - +cd sims/firesim +git fetch --tags && git checkout 1.13.6 +./build-setup.sh --library --skip-validate ``` Setting Up Gemmini diff --git a/scripts/build-midas.sh b/scripts/build-midas.sh index c966513c..590ced9d 100755 --- a/scripts/build-midas.sh +++ b/scripts/build-midas.sh @@ -53,6 +53,8 @@ if [ dram_model == "" ]; then echo DRAM model must be provided. 
fi +export SYSLIBS=" $SYSLIBS -l:libdwarf.so -l:libelf.so -lz -lgmp " + cd ../../sims/firesim/ source sourceme-f1-manager.sh &> build.log diff --git a/scripts/run-midas.sh b/scripts/run-midas.sh index 9bae1813..806b21b1 100755 --- a/scripts/run-midas.sh +++ b/scripts/run-midas.sh @@ -121,5 +121,6 @@ if [ ! -f ./${simulator}${DEBUG} ]; then fi ./${simulator}${DEBUG} ${PK} ${full_binary_path} ${waveform_flag} \ - +vcs+initreg+0 +vcs+initmem+0 +fesvr-step-size=128 +mm_relaxFunctionalModel_0=0 +mm_openPagePolicy_0=1 +mm_backendLatency_0=2 +mm_schedulerWindowSize_0=8 +mm_transactionQueueDepth_0=8 +mm_dramTimings_tAL_0=0 +mm_dramTimings_tCAS_0=14 +mm_dramTimings_tCMD_0=1 +mm_dramTimings_tCWD_0=10 +mm_dramTimings_tCCD_0=4 +mm_dramTimings_tFAW_0=25 +mm_dramTimings_tRAS_0=33 +mm_dramTimings_tREFI_0=7800 +mm_dramTimings_tRC_0=47 +mm_dramTimings_tRCD_0=14 +mm_dramTimings_tRFC_0=160 +mm_dramTimings_tRRD_0=8 +mm_dramTimings_tRP_0=14 +mm_dramTimings_tRTP_0=8 +mm_dramTimings_tRTRS_0=2 +mm_dramTimings_tWR_0=15 +mm_dramTimings_tWTR_0=8 +mm_rowAddr_offset_0=18 +mm_rowAddr_mask_0=65535 +mm_rankAddr_offset_0=16 +mm_rankAddr_mask_0=3 +mm_bankAddr_offset_0=13 +mm_bankAddr_mask_0=7 +mm_llc_wayBits_0=3 +mm_llc_setBits_0=12 +mm_llc_blockBits_0=7 +mm_llc_activeMSHRs_0=8 +shmemportname0=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +macaddr0=00:00:00:00:00:02 +niclog0=niclog0 +linklatency0=6405 +netbw0=100 +netburst0=8 +nic-loopback0 +tracefile=TRACEFILE +blkdev-in-mem0=128 +blkdev-log0=blkdev-log0 +autocounter-readrate=1000 +autocounter-filename=AUTOCOUNTERFILE +dramsim +max-cycles=100000000 \ + +vcs+initreg+0 +vcs+initmem+0 +fesvr-step-size=128 +mm_relaxFunctionalModel_0=0 +mm_openPagePolicy_0=1 +mm_backendLatency_0=2 +mm_schedulerWindowSize_0=8 +mm_transactionQueueDepth_0=8 +mm_dramTimings_tAL_0=0 +mm_dramTimings_tCAS_0=14 +mm_dramTimings_tCMD_0=1 +mm_dramTimings_tCWD_0=10 +mm_dramTimings_tCCD_0=4 +mm_dramTimings_tFAW_0=25 
+mm_dramTimings_tRAS_0=33 +mm_dramTimings_tREFI_0=7800 +mm_dramTimings_tRC_0=47 +mm_dramTimings_tRCD_0=14 +mm_dramTimings_tRFC_0=160 +mm_dramTimings_tRRD_0=8 +mm_dramTimings_tRP_0=14 +mm_dramTimings_tRTP_0=8 +mm_dramTimings_tRTRS_0=2 +mm_dramTimings_tWR_0=15 +mm_dramTimings_tWTR_0=8 +mm_rowAddr_offset_0=18 +mm_rowAddr_mask_0=65535 +mm_rankAddr_offset_0=16 +mm_rankAddr_mask_0=3 +mm_bankAddr_offset_0=13 +mm_bankAddr_mask_0=7 +mm_llc_wayBits_0=3 +mm_llc_setBits_0=12 +mm_llc_blockBits_0=7 +mm_llc_activeMSHRs_0=8 +shmemportname0=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +macaddr0=00:00:00:00:00:02 +niclog0=niclog0 +linklatency0=6405 +netbw0=100 +netburst0=8 +nic-loopback0 +tracefile=TRACEFILE +blkdev-in-mem0=128 +blkdev-log0=blkdev-log0 +autocounter-readrate=1000 +autocounter-filename=AUTOCOUNTERFILE +max-cycles=100000000 \ + +dramsim +dramsim_ini_dir=/home/eecs/hngenc/chip/generators/testchipip/src/main/resources/dramsim2_ini \ 2>/dev/null diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index 37464740..70f8e58b 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit 374647403d5e73543463f9f66f730aa16bc8e362 +Subproject commit 70f8e58b8113b288f0937a4777fe582437ea36ce diff --git a/src/main/scala/gemmini/BeatMerger.scala b/src/main/scala/gemmini/BeatMerger.scala index a845327b..a6a67dab 100644 --- a/src/main/scala/gemmini/BeatMerger.scala +++ b/src/main/scala/gemmini/BeatMerger.scala @@ -1,3 +1,4 @@ + package gemmini import chisel3._ @@ -26,7 +27,8 @@ class BeatMergerOut(val spadWidth: Int, val accWidth: Int, val spadRows: Int, va maxReqBytes: in bytes aligned_to: in bytes */ -class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWidth: Int, spadRows: Int, accRows: Int, maxReqBytes: Int, alignedTo: Int, meshRows: Int, mvin_scale_t_bits: Int, nCmds: Int) +class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, 
spadWidth: Int, accWidth: Int, spadRows: Int, accRows: Int, + maxReqBytes: Int, alignedTo: Int, meshRows: Int, mvin_scale_t_bits: Int, nCmds: Int) extends Module { val io = IO(new Bundle { val req = Flipped(Decoupled(new XactTrackerEntry(maxShift, spadWidth, accWidth, spadRows, accRows, maxReqBytes, mvin_scale_t_bits, nCmds))) @@ -75,9 +77,10 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid val total_bytes_sent = req.bits.spad_row_offset + bytesSent Mux(req.bits.has_acc_bitwidth, // We only add "if" statements here to satisfy the Verilator linter. The code would be cleaner without the - // "if" condition and the "else" clause - if (total_bytes_sent.getWidth >= log2Up(accWidthBytes+1)) total_bytes_sent / accWidthBytes.U else 0.U, - if (total_bytes_sent.getWidth >= log2Up(spadWidthBytes+1)) total_bytes_sent / spadWidthBytes.U else 0.U) + // "if" condition and the "else" clause. Similarly, the width expansions are also there to satisfy the Verilator + // linter, despite making the code uglier. + if (total_bytes_sent.getWidth >= log2Up(accWidthBytes + 1)) total_bytes_sent / accWidthBytes.U(total_bytes_sent.getWidth.W) else 0.U, + if (total_bytes_sent.getWidth >= log2Up(spadWidthBytes + 1)) total_bytes_sent / spadWidthBytes.U(total_bytes_sent.getWidth.W) else 0.U) } io.out.bits.is_acc := req.bits.is_acc diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala index 9761228f..41164f30 100644 --- a/src/main/scala/gemmini/DMA.scala +++ b/src/main/scala/gemmini/DMA.scala @@ -1,3 +1,4 @@ + package gemmini import chisel3._ @@ -120,10 +121,11 @@ class StreamReadBeat (val nXacts: Int, val beatBits: Int, val maxReqBytes: Int) } // TODO StreamReaderCore and StreamWriter are actually very alike. Is there some parent class they could both inherit from? 
-class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], nXacts: Int, beatBits: Int, maxBytes: Int, - spadWidth: Int, accWidth: Int, aligned_to: Int, - spad_rows: Int, acc_rows: Int, meshRows: Int, use_tlb_register_filter: Boolean, - use_firesim_simulation_counters: Boolean) +class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], nXacts: Int, beatBits: Int, + maxBytes: Int, spadWidth: Int, accWidth: Int, aligned_to: Int, + spad_rows: Int, acc_rows: Int, meshRows: Int, + use_tlb_register_filter: Boolean, + use_firesim_simulation_counters: Boolean) (implicit p: Parameters) extends LazyModule { val node = TLHelper.makeClientNode( name = "stream-reader", sourceId = IdRange(0, nXacts)) @@ -263,9 +265,10 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf io.reserve.entry.addr := req.spaddr + req.block_stride * Mux(req.has_acc_bitwidth, // We only add "if" statements here to satisfy the Verilator linter. The code would be cleaner without the - // "if" condition and the "else" clause - if (bytesRequested.getWidth >= log2Up(accWidthBytes+1)) bytesRequested / accWidthBytes.U else 0.U, - if (bytesRequested.getWidth >= log2Up(spadWidthBytes+1)) bytesRequested / spadWidthBytes.U else 0.U) + // "if" condition and the "else" clause. Similarly, the width expansions are also there to satisfy the Verilator + // linter, despite making the code uglier. 
+ if (bytesRequested.getWidth >= log2Up(accWidthBytes+1)) bytesRequested / accWidthBytes.U(bytesRequested.getWidth.W) else 0.U, + if (bytesRequested.getWidth >= log2Up(spadWidthBytes+1)) bytesRequested / spadWidthBytes.U(bytesRequested.getWidth.W) else 0.U) io.reserve.entry.spad_row_offset := Mux(req.has_acc_bitwidth, bytesRequested % accWidthBytes.U, bytesRequested % spadWidthBytes.U) when (untranslated_a.fire) { @@ -408,7 +411,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: val bytes_written = UInt(log2Up(maxBytes+1).W) val bytes_written_per_beat = Vec(maxBeatsPerReq, UInt(log2Up(beatBytes+1).W)) - def total_beats(dummy: Int = 0) = Mux(size < beatBytes.U, 1.U, size / beatBytes.U) + def total_beats(dummy: Int = 0) = Mux(size < beatBytes.U, 1.U, size / beatBytes.U(size.getWidth.W)) // The width expansion is added here solely to satisfy Verilator's linter } val smallest_write_size = aligned_to max beatBytes diff --git a/src/main/scala/gemmini/LoadController.scala b/src/main/scala/gemmini/LoadController.scala index db69857a..71ecf7c7 100644 --- a/src/main/scala/gemmini/LoadController.scala +++ b/src/main/scala/gemmini/LoadController.scala @@ -114,7 +114,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig cmd_tracker.io.alloc.valid := control_state === waiting_for_command && cmd.valid && DoLoad cmd_tracker.io.alloc.bits.bytes_to_read := Mux(io.dma.req.bits.has_acc_bitwidth, cols * actual_rows_read * config.accType.getWidth.U, - cols * actual_rows_read * config.inputType.getWidth.U) / 8.U + cols * actual_rows_read * config.inputType.getWidth.U) >> 3 // We replaced a very clear "/ 8.U" operation here with a ">> 3" operation, solely to satisfy Verilator's linter cmd_tracker.io.alloc.bits.tag.rob_id := cmd.bits.rob_id.bits cmd_tracker.io.request_returned.valid := io.dma.resp.fire // TODO use a bundle connect cmd_tracker.io.request_returned.bits.cmd_id := io.dma.resp.bits.cmd_id // TODO use a bundle
connect diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index 3fe28e98..210bcade 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -1,3 +1,4 @@ + package gemmini import chisel3._ @@ -115,7 +116,7 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi // Addresses val dram_offset = och * (acc_w/8).U val dram_addr = Mux(req.no_bias, 0.U, req.dram_addr + LoopConv.castDramOffset(dram_offset)) - val spad_addr = acc_addr_start +& (och / block_size.U) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol + val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol // Sizes val I = Mux(ocols - ocol > block_size.U, block_size.U, ocols - ocol) @@ -225,9 +226,10 @@ class LoopConvLdInputReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: val loop_id = UInt(log2Up(concurrent_loops).W) } -class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int, - max_block_len: Int, concurrent_loops: Int, latency: Int, - config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module { +class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, + tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int, max_block_len: Int, + concurrent_loops: Int, latency: Int, config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2) + (implicit p: Parameters) extends Module { val MVIN_SCALE_IDENTITY = 0x3f800000.U // TODO get this from configs somehow val io = IO(new Bundle { @@ -401,9 +403,10 @@ class LoopConvLdWeightReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: val loop_id = UInt(log2Up(concurrent_loops).W) } -class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: 
Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int, - max_block_len: Int, concurrent_loops: Int, latency: Int, - config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module { +class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, + small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int, + max_block_len: Int, concurrent_loops: Int, latency: Int, config_mvin_rs1_t: ConfigMvinRs1, + mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module { val MVIN_SCALE_IDENTITY = 0x3f800000.U // TODO get this from configs somehow val io = IO(new Bundle { @@ -460,8 +463,9 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit val dram_addr = req.dram_addr + LoopConv.castDramOffset(dram_offset) val spad_addr = Mux(req.trans_weight_0132, - addr_start + (kch / block_size.U) * krows * kcols * ochs + krow * kcols * ochs + kcol * ochs + och, - addr_start + (och / block_size.U) * krows * kcols * kchs + krow * kcols * kchs + kcol * kchs + kch) + // The width expansions are added here solely to prevent Verilator's "WIDTH" warnings, despite making the code uglier + addr_start + (kch / block_size.U(kch.getWidth.W)) * krows * kcols * ochs + krow * kcols * ochs + kcol * ochs + och, + addr_start + (och / block_size.U(och.getWidth.W)) * krows * kcols * kchs + krow * kcols * kchs + kcol * kchs + kch) // Sizes val J = Mux(req.trans_weight_0132, @@ -646,13 +650,14 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera // Addresses val a_addr = Mux(req.trans_input_3120, a_addr_start +& (b / block_size.U) * input_spad_stride +& kch * (irows >> req.downsample) * (icols >> req.downsample) +& (irow >> req.downsample) * (icols >> req.downsample) +& (icol >> req.downsample), - a_addr_start +& (kch / block_size.U) * input_spad_stride +& b * (irows >> req.downsample) * (icols 
>> req.downsample) +& (irow >> req.downsample) * (icols >> req.downsample) +& (icol >> req.downsample)) + a_addr_start +& (kch / block_size.U(kch.getWidth.W)) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow >> req.downsample) * (icols >> req.downsample) +& (icol >> req.downsample)) // val c_addr = Mux(ex_overwrite && krow === 0.U && kcol === 0.U && kch === 0.U, d_addr_start, c_addr_start) +& // (och / block_size.U) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol + // The width expansions are added here solely to prevent Verilator's "WIDTH" warnings, despite making the code uglier val c_addr = c_addr_start +& - (och / block_size.U) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol + (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol // val new_weights = b === 0.U && orow === 0.U && ocol === 0.U val new_weights = Reg(Bool()) @@ -660,8 +665,8 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera val kcol_ = Mux(req.wrot180, kcols - kcol - 1.U, kcol) val b_addr = Mux(req.trans_weight_0132, - b_addr_start +& (kch / block_size.U) * krows * kcols * ochs +& krow_ * kcols * ochs +& kcol_ * ochs +& och, - b_addr_start +& (och / block_size.U) * krows * kcols * kchs +& krow_ * kcols * kchs +& kcol_ * kchs +& kch) + b_addr_start +& (kch / block_size.U(och.getWidth.W)) * krows * kcols * ochs +& krow_ * kcols * ochs +& kcol_ * ochs +& och, + b_addr_start +& (och / block_size.U(och.getWidth.W)) * krows * kcols * kchs +& krow_ * kcols * kchs +& kcol_ * kchs +& kch) class RoCCCommandWithAddr extends Bundle { val cmd = new RoCCCommand @@ -877,10 +882,10 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: ((orow*out_dim*batch_size +& ocol*batch_size +& b) * out_channels +& och) * (input_w/8).U, ((b*out_dim*out_dim +& orow*out_dim +& ocol) * out_channels +& och) * (input_w/8).U) val 
dram_addr = req.dram_addr + LoopConv.castDramOffset(dram_offset) - val spad_addr = acc_addr_start +& (och / block_size.U) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol + val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol val pool_dram_addr = req.dram_addr + ((b * pool_out_dim * pool_out_dim) * out_channels + och) * (input_w/8).U - val pool_spad_addr = acc_addr_start +& (och / block_size.U) * batches * orows * ocols +& b * orows * ocols + val pool_spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols // Sizes val I = Mux(ocols - ocol > block_size.U, block_size.U, ocols - ocol) @@ -1116,8 +1121,8 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s result.ichs := kchs - result.out_channels_per_bank := result.ochs / block_size.U +& (result.ochs % block_size.U =/= 0.U) - result.in_channels_per_bank := result.ichs / block_size.U +& (result.ichs % block_size.U =/= 0.U) + result.out_channels_per_bank := result.ochs / block_size.U(result.ochs.getWidth.W) +& (result.ochs % block_size.U =/= 0.U) + result.in_channels_per_bank := result.ichs / block_size.U(result.ochs.getWidth.W) +& (result.ichs % block_size.U =/= 0.U) result.bias_spad_stride := batches * orows * ocols result.input_spad_stride := Mux(trans_input_3120, diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala index 5f564000..86552d56 100644 --- a/src/main/scala/gemmini/LoopMatmul.scala +++ b/src/main/scala/gemmini/LoopMatmul.scala @@ -1,3 +1,4 @@ + package gemmini import chisel3._ diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala index 72386aad..cfd60c28 100644 --- a/src/main/scala/gemmini/ReservationStation.scala +++ b/src/main/scala/gemmini/ReservationStation.scala @@ -23,7 +23,8 @@ class 
ReservationStationIssue[T <: Data](cmd_t: T, id_width: Int) extends Bundle } // TODO we don't need to store the full command in here. We should be able to release the command directly into the relevant controller and only store the associated metadata in the ROB. This would reduce the size considerably -class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], cmd_t: GemminiCmd) extends Module { +class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], + cmd_t: GemminiCmd) extends Module { import config._ val block_rows = tileRows * meshRows @@ -251,7 +252,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val mvout_cols = cmd.rs2(32 + mvout_cols_bits - 1, 32) val mvout_rows = cmd.rs2(48 + mvout_rows_bits - 1, 48) - val mvout_mats = mvout_cols / block_cols.U + (mvout_cols % block_cols.U =/= 0.U) + val mvout_mats = mvout_cols / block_cols.U(mvout_cols_bits.W) + (mvout_cols % block_cols.U =/= 0.U) val total_mvout_rows = ((mvout_mats - 1.U) * block_stride) + mvout_rows op2.bits.end := op2.bits.start + total_mvout_rows @@ -273,7 +274,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val mvin_cols = cmd.rs2(32 + mvin_cols_bits - 1, 32) val mvin_rows = cmd.rs2(48 + mvin_rows_bits - 1, 48) - val mvin_mats = mvin_cols / block_cols.U + (mvin_cols % block_cols.U =/= 0.U) + val mvin_mats = mvin_cols / block_cols.U(mvin_cols_bits.W) + (mvin_cols % block_cols.U =/= 0.U) val total_mvin_rows = ((mvin_mats - 1.U) * block_stride) + mvin_rows // TODO We have to know how the LoopConv's internals work here. 
Our abstractions are leaking diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala index 45ec459c..c9e4fdbb 100644 --- a/src/main/scala/gemmini/StoreController.scala +++ b/src/main/scala/gemmini/StoreController.scala @@ -88,7 +88,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm val localaddr = mvout_rs2.local_addr val cols = mvout_rs2.num_cols val rows = mvout_rs2.num_rows - val blocks = (cols / block_cols.U) + (cols % block_cols.U =/= 0.U) + val blocks = (cols / block_cols.U(cols.getWidth.W)) + (cols % block_cols.U =/= 0.U) val config_mvout_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvoutRs1) val config_mvout_rs2 = cmd.bits.cmd.rs2.asTypeOf(new ConfigMvoutRs2(acc_scale_t_bits, 32)) diff --git a/src/main/scala/gemmini/ZeroWriter.scala b/src/main/scala/gemmini/ZeroWriter.scala index a5c10abe..c2861f71 100644 --- a/src/main/scala/gemmini/ZeroWriter.scala +++ b/src/main/scala/gemmini/ZeroWriter.scala @@ -1,3 +1,4 @@ + package gemmini import chisel3._ @@ -40,7 +41,7 @@ class ZeroWriter[T <: Data, U <: Data, V <: Data, Tag <: Data](config: GemminiAr io.req.ready := !req.valid io.resp.valid := req.valid - io.resp.bits.laddr := req.bits.laddr + req.bits.block_stride * (col_counter / block_cols.U) + io.resp.bits.laddr := req.bits.laddr + req.bits.block_stride * (col_counter / block_cols.U(col_counter.getWidth.W)) // The width expansion here is added solely to satisfy Verilator's linter io.resp.bits.mask.zipWithIndex.foreach { case (m, i) => m := col_counter + i.U < req.bits.cols } io.resp.bits.last := col_counter +& block_cols.U >= req.bits.cols io.resp.bits.tag := req.bits.tag From dc7ffc0ee6acbc946c1585c7c3816c475d0c93b1 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Thu, 25 Aug 2022 17:21:28 -0700 Subject: [PATCH 05/64] build gemmini on self-hosted machines in ci --- .github/workflows/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/config.yml b/.github/workflows/config.yml index cae5f7fd..2a092e8d 100644 --- a/.github/workflows/config.yml +++ b/.github/workflows/config.yml @@ -51,7 +51,7 @@ jobs: prepare-gemmini-config: name: prepare-gemmini-config - runs-on: ubuntu-latest + runs-on: self-hosted needs: [prepare-build-environment, install-esp-toolchain] container: image: ucbbar/chipyard-ci-image:554b436 From 350b3196bab88b75d29c61da70487231b3f2249c Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Tue, 22 Nov 2022 15:23:05 -0800 Subject: [PATCH 06/64] Decoupled gemmini dma width from sbus width (#256) This enables correct functionality of gemmini when sbus width != 16 bytes. --- src/main/scala/gemmini/Configs.scala | 2 -- src/main/scala/gemmini/ConfigsFP.scala | 5 ----- src/main/scala/gemmini/CustomConfigs.scala | 1 - src/main/scala/gemmini/DSEConfigs.scala | 12 ------------ src/main/scala/gemmini/Scratchpad.scala | 4 ++-- 5 files changed, 2 insertions(+), 22 deletions(-) diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala index 2a060ea9..bd84b317 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -254,13 +254,11 @@ class DefaultGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( gemmini } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) // This Gemmini config has both an Int and an FP Gemmini side-by-side, sharing // the same scratchpad. 
class DualGemminiConfig extends Config((site, here, up) => { - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) case BuildRoCC => { var int_gemmini: Gemmini[_,_,_] = null var fp_gemmini: Gemmini[_,_,_] = null diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala index 740ece36..c76907dd 100644 --- a/src/main/scala/gemmini/ConfigsFP.scala +++ b/src/main/scala/gemmini/ConfigsFP.scala @@ -121,7 +121,6 @@ class GemminiFP32DefaultConfig extends Config((site, here, up) => { LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) @@ -134,7 +133,6 @@ class GemminiFP16DefaultConfig extends Config((site, here, up) => { LazyModule(new Gemmini(GemminiFPConfigs.FP16DefaultConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========BFLOAT16 Default Config========= @@ -146,7 +144,6 @@ class GemminiBF16DefaultConfig extends Config((site, here, up) => { LazyModule(new Gemmini(GemminiFPConfigs.BF16DefaultConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) class GemminiBF16DefaultHighPerfConfig extends Config((site, here, up) => { @@ -161,7 +158,6 @@ class GemminiBF16DefaultHighPerfConfig extends Config((site, here, up) => { gemmini } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========BFLOAT16 Default Config 8x8========= @@ -173,6 +169,5 @@ class GemminiBF16Default8Config extends Config((site, here, up) => { LazyModule(new Gemmini(GemminiFPConfigs.BF16Default8Config)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala index 9172e8ee..5b6e0610 100644 --- a/src/main/scala/gemmini/CustomConfigs.scala +++ b/src/main/scala/gemmini/CustomConfigs.scala @@ -64,5 +64,4 @@ class GemminiCustomConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( gemmini } ) - case SystemBusKey 
=> up(SystemBusKey).copy(beatBytes = 16) }) diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala index a34658e3..3ed92c7c 100644 --- a/src/main/scala/gemmini/DSEConfigs.scala +++ b/src/main/scala/gemmini/DSEConfigs.scala @@ -119,7 +119,6 @@ class GemminiParamsDSE1 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.baseConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========DATAFLOW CHANGE: WS========= @@ -131,7 +130,6 @@ class GemminiParamsDSE2 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.wsOnlyConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========DATAFLOW CHANGE: BOTH========= @@ -143,7 +141,6 @@ class GemminiParamsDSE3 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.bothDataflowsConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========BITWIDTH CHANGE: 32 BITS========= @@ -155,7 +152,6 @@ class GemminiParamsDSE4 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.highBitwidthConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========DIMENSIONS CHANGE: 32x32========= @@ -167,7 +163,6 @@ class GemminiParamsDSE5 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.largerDimConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========PIPELINE DEPTH CHANGE: Fully Combinational========= @@ -179,7 +174,6 @@ class GemminiParamsDSE6 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.fullyCombinationalConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========MEMORY CAPACITY CHANGE: 256 KB========= @@ -191,7 +185,6 @@ class GemminiParamsDSE7 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.moreMemoryConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========MEMORY 
BANKS CHANGE: 33 Banks========= @@ -203,7 +196,6 @@ class GemminiParamsDSE8 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.moreBanksConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========BUS WIDTH CHANGE: 64 bits========= @@ -215,7 +207,6 @@ class GemminiParamsDSE10 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.narrowerBusConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 8) }) //===========PnR 16-by-16========= @@ -227,7 +218,6 @@ class GemminiParamsPnR16 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.pnr16Config)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========PnR 32-by-32========= @@ -239,7 +229,6 @@ class GemminiParamsPnR32 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.pnr32Config)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) //===========Scalar Processor Change========= @@ -251,7 +240,6 @@ class GemminiParamsDSE11 extends Config((site, here, up) => { LazyModule(new Gemmini(DSEConfigs.baseConfig)) } ) - case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16) }) // ----------------------------- diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index 64b66bde..70c9140f 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -7,7 +7,7 @@ import freechips.rocketchip.config.Parameters import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp} import freechips.rocketchip.rocket._ import freechips.rocketchip.tile._ -import freechips.rocketchip.tilelink.{TLIdentityNode, TLXbar, TLBuffer} +import freechips.rocketchip.tilelink._ import Util._ @@ -199,7 +199,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, xbar_node := TLBuffer() := reader.node // TODO xbar_node := TLBuffer() := writer.node - id_node := TLBuffer() := xbar_node + 
id_node := TLWidthWidget(config.dma_buswidth/8) := TLBuffer() := xbar_node lazy val module = new LazyModuleImp(this) with HasCoreParameters { From e66e5c075f25419d7c8a19d810b7faf357afaed1 Mon Sep 17 00:00:00 2001 From: Abraham Gonzalez Date: Tue, 22 Nov 2022 18:27:14 -0800 Subject: [PATCH 07/64] Rename counter file module (#247) When testing ucb-bar/chipyard#1239, the new FIRRTL compiler errors when modules are named module. This avoids this issue. --- src/main/scala/gemmini/CounterFile.scala | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/main/scala/gemmini/CounterFile.scala b/src/main/scala/gemmini/CounterFile.scala index 35f50c20..7b28b8e2 100644 --- a/src/main/scala/gemmini/CounterFile.scala +++ b/src/main/scala/gemmini/CounterFile.scala @@ -225,8 +225,8 @@ class CounterController(nPerfCounter: Int, counterWidth: Int)(implicit p: Parame if (nPerfCounter > 0) { val nCounterIndexBit = log2Ceil(nPerfCounter) - val module = Module(new CounterFile(nPerfCounter: Int, counterWidth: Int)) - module.io.event_io <> io.event_io + val counterfile = Module(new CounterFile(nPerfCounter: Int, counterWidth: Int)) + counterfile.io.event_io <> io.event_io val out_reg = Reg(io.out.bits.cloneType) val out_valid_reg = RegInit(false.B) @@ -242,13 +242,13 @@ class CounterController(nPerfCounter: Int, counterWidth: Int)(implicit p: Parame // rs1[31] = External counter flag io.in.ready := !out_valid_reg - module.io.addr := io.in.bits.rs1(nCounterIndexBit + 3, 4) - module.io.counter_reset := io.in.bits.rs1(0) & io.in.fire - module.io.snapshot_reset := io.in.bits.rs1(1) & io.in.fire - module.io.snapshot := io.in.bits.rs1(2) & io.in.fire - module.io.config_address.valid := io.in.bits.rs1(3) & io.in.fire - module.io.config_address.bits := io.in.bits.rs1(17, 12) - module.io.external := io.in.bits.rs1(31) + counterfile.io.addr := io.in.bits.rs1(nCounterIndexBit + 3, 4) + counterfile.io.counter_reset := io.in.bits.rs1(0) & io.in.fire + 
counterfile.io.snapshot_reset := io.in.bits.rs1(1) & io.in.fire + counterfile.io.snapshot := io.in.bits.rs1(2) & io.in.fire + counterfile.io.config_address.valid := io.in.bits.rs1(3) & io.in.fire + counterfile.io.config_address.bits := io.in.bits.rs1(17, 12) + counterfile.io.external := io.in.bits.rs1(31) when (io.out.fire) { out_valid_reg := false.B @@ -256,7 +256,7 @@ class CounterController(nPerfCounter: Int, counterWidth: Int)(implicit p: Parame out_valid_reg := true.B out_reg.rd := io.in.bits.inst.rd out_reg.data := 0.U - out_reg.data := module.io.data + out_reg.data := counterfile.io.data } io.out.valid := out_valid_reg From ee42df52f0b557831ff3fc21de339bfee802265b Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Wed, 23 Nov 2022 14:20:09 -0800 Subject: [PATCH 08/64] Merge in some changes from MLSys tutorial (#257) Some of the updates made for the MLSys tutorial are also useful for general users. This PR merges in those changes. --- SPIKE.hash | 2 +- scripts/build-onnx-inference.sh | 1 + scripts/build-onnx-training.sh | 2 +- scripts/build-vcs.sh | 4 ++-- scripts/build-verilator.sh | 6 +++--- scripts/run-midas.sh | 2 +- scripts/run-spike.sh | 2 +- scripts/run-vcs.sh | 2 +- scripts/run-verilator.sh | 2 +- software/gemmini-rocc-tests | 2 +- software/onnxruntime-riscv | 2 +- src/main/scala/gemmini/CustomConfigs.scala | 3 ++- src/main/scala/gemmini/DMA.scala | 3 --- src/main/scala/gemmini/ExecuteController.scala | 3 --- src/main/scala/gemmini/MeshWithDelays.scala | 2 -- src/main/scala/gemmini/ReservationStation.scala | 1 - src/main/scala/gemmini/ZeroWriter.scala | 9 ++++++++- 17 files changed, 24 insertions(+), 24 deletions(-) diff --git a/SPIKE.hash b/SPIKE.hash index 27baea53..8cbb8d37 100644 --- a/SPIKE.hash +++ b/SPIKE.hash @@ -1 +1 @@ -2ed403a70f65559a3c2a06bf724d4737edc73a23 +051d820f08be84d069993de4375d29c91eb2f577 diff --git a/scripts/build-onnx-inference.sh b/scripts/build-onnx-inference.sh index 01d6e8ce..07999b29 100755 --- 
a/scripts/build-onnx-inference.sh +++ b/scripts/build-onnx-inference.sh @@ -5,3 +5,4 @@ rm -rf ./build/ ./build.sh --parallel --enable_training --config=Debug --cmake_extra_defines onnxruntime_USE_SYSTOLIC=ON onnxruntime_SYSTOLIC_INT8=ON onnxruntime_SYSTOLIC_FP32=OFF cd ./systolic_runner/imagenet_runner/ ./build.sh --parallel --enable_training --config=Debug + diff --git a/scripts/build-onnx-training.sh b/scripts/build-onnx-training.sh index 55c9bc7b..bcb45565 100755 --- a/scripts/build-onnx-training.sh +++ b/scripts/build-onnx-training.sh @@ -1,6 +1,6 @@ #!/bin/bash -cd /root/chipyard/generators/gemmini/software/onnxruntime-riscv/ +cd ./software/onnxruntime-riscv/ rm -rf ./build/ ./build.sh --parallel --enable_training --config=Debug --cmake_extra_defines onnxruntime_USE_SYSTOLIC=ON onnxruntime_SYSTOLIC_INT8=OFF onnxruntime_SYSTOLIC_FP32=ON cd ./systolic_runner/imagenet_trainer/ diff --git a/scripts/build-vcs.sh b/scripts/build-vcs.sh index b15a23f2..23f159b0 100755 --- a/scripts/build-vcs.sh +++ b/scripts/build-vcs.sh @@ -9,7 +9,7 @@ help () { echo "Options:" echo " debug Builds a VCS simulator which generates waveforms. Without this" echo " option, the simulator will not generate any waveforms." - echo " j [N] Allow N jobs at once; infinite jobs with no arg." + echo " j [N] Allow N jobs at once. Default is 1." exit } @@ -21,7 +21,7 @@ while [ $# -gt 0 ] ; do case $1 in -h | --help) show_help=1 ;; --debug) debug="debug" ;; - -j) j=$1 + -j) j=$2; shift esac shift diff --git a/scripts/build-verilator.sh b/scripts/build-verilator.sh index 49a25f29..477c0910 100755 --- a/scripts/build-verilator.sh +++ b/scripts/build-verilator.sh @@ -9,7 +9,7 @@ help () { echo "Options:" echo " debug Builds a Verilator simulator which generates waveforms. Without" echo " this option, the simulator will not generate any waveforms." - echo " j [N] Allow N jobs at once; infinite jobs with no arg." + echo " j [N] Allow N jobs at once. Default is 1." 
exit } @@ -21,7 +21,7 @@ while [ $# -gt 0 ] ; do case $1 in -h | --help) show_help=1 ;; --debug) debug="debug" ;; - -j) j=$1 + -j) j=$2; shift esac shift @@ -32,5 +32,5 @@ if [ $show_help -eq 1 ]; then fi cd ../../sims/verilator/ -make ${debug} CONFIG=CustomGemminiSoCConfig +make -j$j ${debug} CONFIG=CustomGemminiSoCConfig diff --git a/scripts/run-midas.sh b/scripts/run-midas.sh index 806b21b1..63616809 100755 --- a/scripts/run-midas.sh +++ b/scripts/run-midas.sh @@ -94,7 +94,7 @@ fi path="" suffix="" -for dir in bareMetalC mlps imagenet ; do +for dir in bareMetalC mlps imagenet transformers ; do if [ -f "software/gemmini-rocc-tests/build/${dir}/${binary}$default_suffix" ]; then path="${ROOT}/software/gemmini-rocc-tests/build/${dir}/" suffix=$default_suffix diff --git a/scripts/run-spike.sh b/scripts/run-spike.sh index 00b5349f..1638b76c 100755 --- a/scripts/run-spike.sh +++ b/scripts/run-spike.sh @@ -60,7 +60,7 @@ fi path="" suffix="" -for dir in bareMetalC mlps imagenet ; do +for dir in bareMetalC mlps imagenet transformers ; do if [ -f "software/gemmini-rocc-tests/build/${dir}/${binary}$default_suffix" ]; then path="software/gemmini-rocc-tests/build/${dir}/" suffix=$default_suffix diff --git a/scripts/run-vcs.sh b/scripts/run-vcs.sh index 0fcbd9b1..40ce9bda 100755 --- a/scripts/run-vcs.sh +++ b/scripts/run-vcs.sh @@ -73,7 +73,7 @@ fi path="" suffix="" -for dir in bareMetalC mlps imagenet ; do +for dir in bareMetalC mlps imagenet transformers ; do if [ -f "software/gemmini-rocc-tests/build/${dir}/${binary}$default_suffix" ]; then path="${ROOT}/software/gemmini-rocc-tests/build/${dir}/" suffix=$default_suffix diff --git a/scripts/run-verilator.sh b/scripts/run-verilator.sh index 58d40d2b..b4f21458 100755 --- a/scripts/run-verilator.sh +++ b/scripts/run-verilator.sh @@ -73,7 +73,7 @@ fi path="" suffix="" -for dir in bareMetalC mlps imagenet ; do +for dir in bareMetalC mlps imagenet transformers ; do if [ -f 
"software/gemmini-rocc-tests/build/${dir}/${binary}$default_suffix" ]; then path="${ROOT}/software/gemmini-rocc-tests/build/${dir}/" suffix=$default_suffix diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index 70f8e58b..b631f97c 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit 70f8e58b8113b288f0937a4777fe582437ea36ce +Subproject commit b631f97c371a52b2cd4fb1f4ec956bbbe86fb34a diff --git a/software/onnxruntime-riscv b/software/onnxruntime-riscv index 0c8c9b4f..daa8999f 160000 --- a/software/onnxruntime-riscv +++ b/software/onnxruntime-riscv @@ -1 +1 @@ -Subproject commit 0c8c9b4f881b5f31d32c6b5a76cac4ee14a8f338 +Subproject commit daa8999f80d5a233b6c478039f548751b7f02f38 diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala index 5b6e0610..ae529a69 100644 --- a/src/main/scala/gemmini/CustomConfigs.scala +++ b/src/main/scala/gemmini/CustomConfigs.scala @@ -41,7 +41,7 @@ object GemminiCustomConfigs { acc_capacity = CapacityInKilobytes(128), ) - val bertInferenceConfig = defaultConfig.copy( + val ibertInferenceConfig = defaultConfig.copy( has_training_convs = false, has_max_pool = false, has_normalizations = true, @@ -65,3 +65,4 @@ class GemminiCustomConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( } ) }) + diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala index 41164f30..71148b67 100644 --- a/src/main/scala/gemmini/DMA.scala +++ b/src/main/scala/gemmini/DMA.scala @@ -463,9 +463,6 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: } val write_packet = RegEnableThru(best_write_packet, state === s_writing_new_block) - for (wp <- write_packets) - dontTouch(wp) - val write_size = write_packet.size val lg_write_size = write_packet.lg_size val write_beats = write_packet.total_beats() diff --git a/src/main/scala/gemmini/ExecuteController.scala 
b/src/main/scala/gemmini/ExecuteController.scala index 514d918e..2ef7fa3f 100644 --- a/src/main/scala/gemmini/ExecuteController.scala +++ b/src/main/scala/gemmini/ExecuteController.scala @@ -508,8 +508,6 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In when (read_a && !io.im2col.req.ready) { a_ready := false.B } - dontTouch(io.im2col.req.ready) - dontTouch(read_a) io.im2col.req.valid := read_a io.im2col.req.bits.addr := a_address_rs1 @@ -994,7 +992,6 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In when(io.completed.valid) { complete_bits_count := complete_bits_count + 1.U } - dontTouch(complete_bits_count) when (reset.asBool()) { // pending_completed_rob_id.valid := false.B diff --git a/src/main/scala/gemmini/MeshWithDelays.scala b/src/main/scala/gemmini/MeshWithDelays.scala index f6cf7517..d0aced16 100644 --- a/src/main/scala/gemmini/MeshWithDelays.scala +++ b/src/main/scala/gemmini/MeshWithDelays.scala @@ -232,8 +232,6 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data] val out_matmul_id = WireInit(shifted(mesh.io.out_id, outBanks, reverse = true)(0)(0)) io.resp.bits.tag := Mux(tagq.io.deq.valid && out_matmul_id === tagq.io.deq.bits.id, tagq.io.deq.bits.tag, tag_garbage) - dontTouch(out_matmul_id) - tagq.io.deq.ready := io.resp.valid && io.resp.bits.last && out_matmul_id === tagq.io.deq.bits.id val total_rows_q = Module(new Queue(new TagWithIdAndTotalRows, tagqlen)) diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala index cfd60c28..68d0e6e7 100644 --- a/src/main/scala/gemmini/ReservationStation.scala +++ b/src/main/scala/gemmini/ReservationStation.scala @@ -180,7 +180,6 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val alloc_fire = io.alloc.fire() - dontTouch(new_entry) io.alloc.ready := false.B when (io.alloc.valid) { val spAddrBits = 32 diff --git 
a/src/main/scala/gemmini/ZeroWriter.scala b/src/main/scala/gemmini/ZeroWriter.scala index c2861f71..a1834a41 100644 --- a/src/main/scala/gemmini/ZeroWriter.scala +++ b/src/main/scala/gemmini/ZeroWriter.scala @@ -41,7 +41,14 @@ class ZeroWriter[T <: Data, U <: Data, V <: Data, Tag <: Data](config: GemminiAr io.req.ready := !req.valid io.resp.valid := req.valid - io.resp.bits.laddr := req.bits.laddr + req.bits.block_stride * (col_counter / block_cols.U(col_counter.getWidth.W)) // The width expansion here is added solely to satisfy Verilator's linter + io.resp.bits.laddr := req.bits.laddr + req.bits.block_stride * { + // This code block was originally just "col_counter / block_cols.U". We + // changed it to satisfy Verilator's linter + if (col_counter.getWidth >= log2Ceil(block_cols+1)) + (col_counter / block_cols.U(col_counter.getWidth.W)) + else + 0.U + } io.resp.bits.mask.zipWithIndex.foreach { case (m, i) => m := col_counter + i.U < req.bits.cols } io.resp.bits.last := col_counter +& block_cols.U >= req.bits.cols io.resp.bits.tag := req.bits.tag From d2922c605fac2ec580e394081944ac4f21ad7924 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Wed, 23 Nov 2022 14:24:35 -0800 Subject: [PATCH 09/64] bump CHIPYARD.hash to Chipyard 1.8.1 --- CHIPYARD.hash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHIPYARD.hash b/CHIPYARD.hash index b154a058..7fb91902 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -481398b910fa95ec88dd578c67ba358a4d83129d +004297b6a8c01be1b2110c4cf4f9393ae1ff8805 From 65bd41d034cc1553ba6d28f2865ea89e710b4d6c Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Wed, 23 Nov 2022 15:39:26 -0800 Subject: [PATCH 10/64] fixes for chipyard 1.8.1 --- .github/scripts/build-toolchains.sh | 2 +- scripts/build-midas.sh | 2 +- src/main/scala/gemmini/CustomSoCConfigs.scala | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/scripts/build-toolchains.sh b/.github/scripts/build-toolchains.sh index fa6017ea..b23d854e 
100755 --- a/.github/scripts/build-toolchains.sh +++ b/.github/scripts/build-toolchains.sh @@ -30,6 +30,6 @@ if [ ! -d "$INSTALL_DIR" ]; then cd $HOME # init all submodules including the tools (doesn't use CI_MAKE_PROC due to mem. constraints) - CHIPYARD_DIR="$LOCAL_CHIPYARD_DIR" NPROC=$CI_MAKE_NPROC $LOCAL_CHIPYARD_DIR/scripts/build-toolchains.sh esp-tools + CHIPYARD_DIR="$LOCAL_CHIPYARD_DIR" NPROC=$CI_MAKE_NPROC $LOCAL_CHIPYARD_DIR/build-setup.sh --skip-conda esp-tools fi diff --git a/scripts/build-midas.sh b/scripts/build-midas.sh index c966513c..ba624087 100755 --- a/scripts/build-midas.sh +++ b/scripts/build-midas.sh @@ -54,7 +54,7 @@ if [ dram_model == "" ]; then fi cd ../../sims/firesim/ -source sourceme-f1-manager.sh &> build.log +source sourceme-f1-manager.sh --skip-ssh-setup &> build.log cd sim/ make ${simulator}${debug} TARGET_CONFIG=${dram_model}_WithDefaultFireSimBridges_WithFireSimConfigTweaks_chipyard.CustomGemminiSoCConfig diff --git a/src/main/scala/gemmini/CustomSoCConfigs.scala b/src/main/scala/gemmini/CustomSoCConfigs.scala index aebfb520..057aa1e1 100644 --- a/src/main/scala/gemmini/CustomSoCConfigs.scala +++ b/src/main/scala/gemmini/CustomSoCConfigs.scala @@ -10,10 +10,10 @@ class CustomGemminiSoCConfig extends Config( new chipyard.config.WithL2TLBs(512) ++ new freechips.rocketchip.subsystem.WithInclusiveCache( - nBanks = 1, nWays = 8, capacityKB = 512, - outerLatencyCycles = 40 + outerLatencyCycles = 40, + subBankingFactor = 4 ) ++ // Set the number of CPUs you want to create From 6bdf36d59234f377829468b97a712b854110e66d Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sat, 26 Nov 2022 09:39:26 -0800 Subject: [PATCH 11/64] updates to ci --- .github/scripts/build-toolchains.sh | 35 --------- .github/scripts/defaults.sh | 2 + .github/scripts/do-rtl-build.sh | 31 +++++--- .github/scripts/enable-conda.sh | 13 ++++ .github/scripts/install-gemmini.sh | 42 +++++++++++ .github/scripts/install-verilator.sh | 20 ----- 
.github/scripts/prepare-for-rtl-build.sh | 23 ------ .github/scripts/remove-chipyard.sh | 10 +++ .github/scripts/run-tests-rtl.sh | 6 +- .github/scripts/run-tests-spike.sh | 24 +++--- .github/workflows/config.yml | 96 +++++++++--------------- 11 files changed, 138 insertions(+), 164 deletions(-) delete mode 100755 .github/scripts/build-toolchains.sh create mode 100644 .github/scripts/enable-conda.sh create mode 100755 .github/scripts/install-gemmini.sh delete mode 100755 .github/scripts/install-verilator.sh delete mode 100755 .github/scripts/prepare-for-rtl-build.sh create mode 100755 .github/scripts/remove-chipyard.sh diff --git a/.github/scripts/build-toolchains.sh b/.github/scripts/build-toolchains.sh deleted file mode 100755 index b23d854e..00000000 --- a/.github/scripts/build-toolchains.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -#------------------------------------------------------------- -# create the riscv tools binaries from ucb-bar/chipyard with rocket-chip hash given by riscv-boom -# -# run location: circle ci docker image -# usage: -# $1 - name of the toolchain to build -#------------------------------------------------------------- - -# turn echo on and error on earliest command -set -ex - -# get shared variables -SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" -source $SCRIPT_DIR/defaults.sh - -INSTALL_DIR="$HOME/$1-install" - -if [ ! -d "$INSTALL_DIR" ]; then - cd $HOME - - git clone --progress --verbose https://github.com/ucb-bar/chipyard.git chipyard - cd $LOCAL_CHIPYARD_DIR - - echo "Checking out Chipyard version: $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)" - git fetch - git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) - - cd $HOME - - # init all submodules including the tools (doesn't use CI_MAKE_PROC due to mem. 
constraints) - CHIPYARD_DIR="$LOCAL_CHIPYARD_DIR" NPROC=$CI_MAKE_NPROC $LOCAL_CHIPYARD_DIR/build-setup.sh --skip-conda esp-tools -fi - diff --git a/.github/scripts/defaults.sh b/.github/scripts/defaults.sh index 6a02a220..e403fc89 100755 --- a/.github/scripts/defaults.sh +++ b/.github/scripts/defaults.sh @@ -28,6 +28,7 @@ LOCAL_ESP_DIR=$HOME/esp-tools-install LOCAL_CHIPYARD_DIR=$HOME/chipyard LOCAL_SIM_DIR=$LOCAL_CHIPYARD_DIR/sims/verilator LOCAL_VERILATOR_DIR=$HOME/verilator-install +LOCAL_CONDA=/opt/conda/ echo "::set-output name=LOCAL_WORK_DIR::$LOCAL_WORK_DIR" echo "::set-output name=LOCAL_CHECKOUT_DIR::$LOCAL_CHECKOUT_DIR" @@ -36,3 +37,4 @@ echo "::set-output name=LOCAL_ESP_DIR::$LOCAL_ESP_DIR" echo "::set-output name=LOCAL_CHIPYARD_DIR::$LOCAL_CHIPYARD_DIR" echo "::set-output name=LOCAL_SIM_DIR::$LOCAL_SIM_DIR" echo "::set-output name=LOCAL_VERILATOR_DIR::$LOCAL_VERILATOR_DIR" +echo "::set-output name=LOCAL_CONDA::$LOCAL_CONDA" diff --git a/.github/scripts/do-rtl-build.sh b/.github/scripts/do-rtl-build.sh index 1b93655e..102aaa60 100755 --- a/.github/scripts/do-rtl-build.sh +++ b/.github/scripts/do-rtl-build.sh @@ -7,20 +7,27 @@ set -ex SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" source $SCRIPT_DIR/defaults.sh -rm -rf $LOCAL_CHIPYARD_DIR/generators/gemmini/* -cd $LOCAL_CHECKOUT_DIR -git submodule update --init --recursive software/gemmini-rocc-tests -mv -f $LOCAL_CHECKOUT_DIR/* $LOCAL_CHIPYARD_DIR/generators/gemmini/ +source $SCRIPT_DIR/enable-conda.sh +cd $LOCAL_CHIPYARD_DIR +source env.sh -TOOLS_DIR=$LOCAL_ESP_DIR -LD_LIB_DIR=$LOCAL_ESP_DIR/lib +cd $LOCAL_CHIPYARD_DIR +echo Printing current chipyard commit +git log -1 --format="%H" -# enter the verilator directory and build the specific config on remote server +cd $LOCAL_CHIPYARD_DIR/generators/gemmini +echo Printing current gemmini commit +git log -1 --format="%H" + +cd $LOCAL_CHIPYARD_DIR/generators/rocket-chip +echo Printing rocket-chip commit +git log -1 --format="%H" + +echo Printing rocket-chip 
sources +ls src/main/scala/ + +cd $LOCAL_SIM_DIR make -C $LOCAL_SIM_DIR clean -export RISCV=$TOOLS_DIR -export LD_LIBRARY_PATH=$LD_LIB_DIR -export PATH=$LOCAL_VERILATOR_DIR/bin:$PATH -export VERILATOR_ROOT=$LOCAL_VERILATOR_DIR -export COURSIER_CACHE=$LOCAL_WORK_DIR/.coursier-cache make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=GemminiRocketConfig + diff --git a/.github/scripts/enable-conda.sh b/.github/scripts/enable-conda.sh new file mode 100644 index 00000000..184ead9b --- /dev/null +++ b/.github/scripts/enable-conda.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PATH="$LOCAL_CONDA/bin:$PATH" +conda init +source ~/.bashrc +conda activate base +if ! { conda env list | grep 'chipyard'; } >/dev/null 2>&1; then + conda create -n chipyard + conda activate chipyard + conda install -c conda-forge conda-lock +fi +conda activate chipyard + diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh new file mode 100755 index 00000000..8a2ef5cd --- /dev/null +++ b/.github/scripts/install-gemmini.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +#------------------------------------------------------------- +# installs gemmini +# +# run location: circle ci docker image +#------------------------------------------------------------- + +# turn echo on and error on earliest command +set -ex + +# get shared variables +SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" +source $SCRIPT_DIR/defaults.sh + +source $SCRIPT_DIR/enable-conda.sh + +cd $HOME +rm -rf chipyard +git clone --progress --verbose https://github.com/ucb-bar/chipyard.git chipyard +cd $LOCAL_CHIPYARD_DIR + +echo "Checking out Chipyard version: $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)" +git fetch +git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) + +./build-setup.sh esp-tools + 
+source env.sh + +cd toolchains/esp-tools/riscv-isa-sim/build +echo "Checking out Spike version $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)" +git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash) +make && make install + +cd $LOCAL_CHECKOUT_DIR +chown -R $(whoami) . +git config --global --add safe.directory $LOCAL_CHECKOUT_DIR +git submodule update --init --recursive software/gemmini-rocc-tests +rm -rf $LOCAL_CHIPYARD_DIR/generators/gemmini/* $LOCAL_CHIPYARD_DIR/generators/gemmini/.git* +mv -f $LOCAL_CHECKOUT_DIR/* $LOCAL_CHECKOUT_DIR/.git* $LOCAL_CHIPYARD_DIR/generators/gemmini/ + diff --git a/.github/scripts/install-verilator.sh b/.github/scripts/install-verilator.sh deleted file mode 100755 index b996b4d0..00000000 --- a/.github/scripts/install-verilator.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -# move verilator to the remote server - -# turn echo on and error on earliest command -set -ex - -# get shared variables -SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" -source $SCRIPT_DIR/defaults.sh - -if [ ! -d "$LOCAL_VERILATOR_DIR" ]; then - git clone http://git.veripool.org/git/verilator $LOCAL_VERILATOR_DIR - cd $LOCAL_VERILATOR_DIR - git checkout $VERILATOR_VERSION - autoconf - export VERILATOR_ROOT=$LOCAL_VERILATOR_DIR - ./configure - make -j$LOCAL_MAKE_NPROC -fi diff --git a/.github/scripts/prepare-for-rtl-build.sh b/.github/scripts/prepare-for-rtl-build.sh deleted file mode 100755 index df3ac470..00000000 --- a/.github/scripts/prepare-for-rtl-build.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# turn echo on and error on earliest command -set -ex - -# get shared variables -SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" -source $SCRIPT_DIR/defaults.sh - -# check to see if both dirs exist -if [ ! 
-d "$LOCAL_CHIPYARD_DIR" ]; then - cd $HOME - - git clone --progress --verbose https://github.com/ucb-bar/chipyard.git chipyard - cd $LOCAL_CHIPYARD_DIR - - echo "Checking out Chipyard version: $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)" - git fetch - git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) - - # init all submodules (according to what chipyard wants) - ./scripts/init-submodules-no-riscv-tools.sh -fi diff --git a/.github/scripts/remove-chipyard.sh b/.github/scripts/remove-chipyard.sh new file mode 100755 index 00000000..8b82019e --- /dev/null +++ b/.github/scripts/remove-chipyard.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -ex + +SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" +source $SCRIPT_DIR/defaults.sh + +rm -rf $LOCAL_CHIPYARD_DIR +rm -rf $LOCAL_CONDA + diff --git a/.github/scripts/run-tests-rtl.sh b/.github/scripts/run-tests-rtl.sh index c5907ddd..47a87ff1 100755 --- a/.github/scripts/run-tests-rtl.sh +++ b/.github/scripts/run-tests-rtl.sh @@ -5,9 +5,10 @@ set -ex SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" source $SCRIPT_DIR/defaults.sh +source $SCRIPT_DIR/enable-conda.sh -TOOLS_DIR=$LOCAL_ESP_DIR -PATH=$PATH:$LOCAL_ESP_DIR/bin +cd $LOCAL_CHIPYARD_DIR +source env.sh cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests CFLAGS=-DFAST ./build.sh @@ -15,4 +16,3 @@ CFLAGS=-DFAST ./build.sh cd build make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_CHIPYARD_DIR/sims/verilator/ CONFIG=GemminiRocketConfig run-binary-hex BINARY='" - diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh index 9f933aaf..c6dbf850 100755 --- a/.github/scripts/run-tests-spike.sh +++ b/.github/scripts/run-tests-spike.sh @@ -5,23 +5,25 @@ set -ex SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" source $SCRIPT_DIR/defaults.sh +source $SCRIPT_DIR/enable-conda.sh -# clone and build our version of spike -TOOLS_DIR=$LOCAL_ESP_DIR -PATH=$PATH:$LOCAL_ESP_DIR/bin +cd $LOCAL_CHIPYARD_DIR +source env.sh -git clone 
https://github.com/ucb-bar/esp-isa-sim.git -cd esp-isa-sim -git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash) -cp $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests/include/gemmini_params.h gemmini/ +cd $LOCAL_CHIPYARD_DIR/toolchains/esp-tools/riscv-isa-sim +echo Printing current spike commit +git log -1 --format="%H" -mkdir build -cd build -../configure --prefix=$TOOLS_DIR -make -j8 install +cd $LOCAL_CHIPYARD_DIR/generators/gemmini +echo Printing current gemmini commit +git log -1 --format="%H" cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests +echo Printing current gemmini-rocc-tests commit +git log -1 --format="%H" + ./build.sh cd build make test-baremetal + diff --git a/.github/workflows/config.yml b/.github/workflows/config.yml index 2a092e8d..f4a2dfda 100644 --- a/.github/workflows/config.yml +++ b/.github/workflows/config.yml @@ -1,11 +1,11 @@ name: Gemmini CI on: [push] jobs: - install-esp-toolchain: - name: install-esp-toolchain + install-gemmini: + name: gemmini-install runs-on: ubuntu-latest container: - image: ucbbar/chipyard-ci-image:554b436 + image: ucbbar/chipyard-ci-image:3f9150 options: --entrypoint /bin/bash steps: - name: checkout @@ -14,47 +14,23 @@ jobs: run: .github/scripts/defaults.sh id: get-paths - - name: toolchain-build - run: .github/scripts/build-toolchains.sh esp-tools + - name: install gemmini + run: .github/scripts/install-gemmini.sh - - name: cache esp-toolchain install - uses: actions/cache@v2 - with: - path: ${{ steps.get-paths.outputs.LOCAL_ESP_DIR }} - key: esp-tools-install-${{ github.ref }}-${{ github.sha }} - - prepare-build-environment: - name: prepare-build-environment - runs-on: ubuntu-latest - container: - image: ucbbar/chipyard-ci-image:554b436 - options: --entrypoint /bin/bash - steps: - - name: checkout - uses: actions/checkout@v2 - - name: get paths - run: .github/scripts/defaults.sh - id: get-paths - - - name: setup build environment - run: 
.github/scripts/prepare-for-rtl-build.sh - - name: install verilator - run: .github/scripts/install-verilator.sh - - - name: cache prepare-build-environment install + - name: cache gemmini install uses: actions/cache@v2 with: path: | ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }} - ${{ steps.get-paths.outputs.LOCAL_VERILATOR_DIR }} - key: prepare-build-environment-${{ github.ref }}-${{ github.sha }} + ${{ steps.get-paths.outputs.LOCAL_CONDA }} + key: gemmini-install-${{ github.ref }}-${{ github.sha }} - prepare-gemmini-config: - name: prepare-gemmini-config + build-gemmini-config: + name: build-gemmini-config runs-on: self-hosted - needs: [prepare-build-environment, install-esp-toolchain] + needs: install-gemmini container: - image: ucbbar/chipyard-ci-image:554b436 + image: ucbbar/chipyard-ci-image:3f9150 options: --entrypoint /bin/bash steps: - name: checkout @@ -63,38 +39,34 @@ jobs: run: .github/scripts/defaults.sh id: get-paths - - name: restore cache esp-toolchain install - uses: actions/cache@v2 - with: - path: ${{ steps.get-paths.outputs.LOCAL_ESP_DIR }} - key: esp-tools-install-${{ github.ref }}-${{ github.sha }} + - name: remove chipyard + run: .github/scripts/remove-chipyard.sh - - name: restore cache prepare-build-environment install + - name: restore cache gemmini install uses: actions/cache@v2 with: path: | ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }} - ${{ steps.get-paths.outputs.LOCAL_VERILATOR_DIR }} - key: prepare-build-environment-${{ github.ref }}-${{ github.sha }} + ${{ steps.get-paths.outputs.LOCAL_CONDA }} + key: gemmini-install-${{ github.ref }}-${{ github.sha }} - name: Building Gemmini Config using Verilator run: .github/scripts/do-rtl-build.sh - - name: cache prepare-gemmini-config install + - name: cache build-gemmini-config install uses: actions/cache@v2 with: path: | ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }} - ${{ steps.get-paths.outputs.LOCAL_VERILATOR_DIR }} - ${{ steps.get-paths.outputs.LOCAL_ESP_DIR }} - key: 
prepare-gemmini-config-${{ github.ref }}-${{ github.sha }} + ${{ steps.get-paths.outputs.LOCAL_CONDA }} + key: build-gemmini-config-${{ github.ref }}-${{ github.sha }} spike-run-tests: name: spike-run-tests runs-on: ubuntu-latest - needs: prepare-gemmini-config + needs: install-gemmini container: - image: ucbbar/chipyard-ci-image:554b436 + image: ucbbar/chipyard-ci-image:3f9150 options: --entrypoint /bin/bash steps: - name: checkout @@ -103,14 +75,16 @@ jobs: run: .github/scripts/defaults.sh id: get-paths - - name: restore cache prepare-gemmini-config install + - name: remove chipyard + run: .github/scripts/remove-chipyard.sh + + - name: restore cache gemmini install uses: actions/cache@v2 with: path: | ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }} - ${{ steps.get-paths.outputs.LOCAL_VERILATOR_DIR }} - ${{ steps.get-paths.outputs.LOCAL_ESP_DIR }} - key: prepare-gemmini-config-${{ github.ref }}-${{ github.sha }} + ${{ steps.get-paths.outputs.LOCAL_CONDA }} + key: gemmini-install-${{ github.ref }}-${{ github.sha }} - name: run-tests run: .github/scripts/run-tests-spike.sh @@ -118,9 +92,9 @@ jobs: rtl-run-tests: name: rtl-run-tests runs-on: ubuntu-latest - needs: prepare-gemmini-config + needs: build-gemmini-config container: - image: ucbbar/chipyard-ci-image:554b436 + image: ucbbar/chipyard-ci-image:3f9150 options: --entrypoint /bin/bash steps: - name: checkout @@ -129,14 +103,16 @@ jobs: run: .github/scripts/defaults.sh id: get-paths - - name: restore cache prepare-gemmini-config install + - name: remove chipyard + run: .github/scripts/remove-chipyard.sh + + - name: restore cache build-gemmini-config install uses: actions/cache@v2 with: path: | ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }} - ${{ steps.get-paths.outputs.LOCAL_VERILATOR_DIR }} - ${{ steps.get-paths.outputs.LOCAL_ESP_DIR }} - key: prepare-gemmini-config-${{ github.ref }}-${{ github.sha }} + ${{ steps.get-paths.outputs.LOCAL_CONDA }} + key: build-gemmini-config-${{ github.ref }}-${{ github.sha 
}} - name: run-tests run: .github/scripts/run-tests-rtl.sh From c1ccf7dc6ed2636ca0eff836880d9d2ef481b79f Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sat, 26 Nov 2022 09:43:31 -0800 Subject: [PATCH 12/64] bump gemmini-rocc-tests --- software/gemmini-rocc-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index b631f97c..ae0cd823 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit b631f97c371a52b2cd4fb1f4ec956bbbe86fb34a +Subproject commit ae0cd8236d32fccf7197a7ac0634df5513cec4db From beb3ee5f2246a153034be161e278b588bf4833a2 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sat, 26 Nov 2022 09:47:03 -0800 Subject: [PATCH 13/64] remove unnecessary echos from ci --- .github/scripts/do-rtl-build.sh | 15 --------------- .github/scripts/install-gemmini.sh | 2 -- .github/scripts/run-tests-spike.sh | 11 ----------- 3 files changed, 28 deletions(-) diff --git a/.github/scripts/do-rtl-build.sh b/.github/scripts/do-rtl-build.sh index 102aaa60..38651571 100755 --- a/.github/scripts/do-rtl-build.sh +++ b/.github/scripts/do-rtl-build.sh @@ -12,21 +12,6 @@ source $SCRIPT_DIR/enable-conda.sh cd $LOCAL_CHIPYARD_DIR source env.sh -cd $LOCAL_CHIPYARD_DIR -echo Printing current chipyard commit -git log -1 --format="%H" - -cd $LOCAL_CHIPYARD_DIR/generators/gemmini -echo Printing current gemmini commit -git log -1 --format="%H" - -cd $LOCAL_CHIPYARD_DIR/generators/rocket-chip -echo Printing rocket-chip commit -git log -1 --format="%H" - -echo Printing rocket-chip sources -ls src/main/scala/ - cd $LOCAL_SIM_DIR make -C $LOCAL_SIM_DIR clean make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=GemminiRocketConfig diff --git 
a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index 8a2ef5cd..0fa6460d 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -20,7 +20,6 @@ rm -rf chipyard git clone --progress --verbose https://github.com/ucb-bar/chipyard.git chipyard cd $LOCAL_CHIPYARD_DIR -echo "Checking out Chipyard version: $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)" git fetch git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) @@ -29,7 +28,6 @@ git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) source env.sh cd toolchains/esp-tools/riscv-isa-sim/build -echo "Checking out Spike version $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)" git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash) make && make install diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh index c6dbf850..93288a75 100755 --- a/.github/scripts/run-tests-spike.sh +++ b/.github/scripts/run-tests-spike.sh @@ -10,18 +10,7 @@ source $SCRIPT_DIR/enable-conda.sh cd $LOCAL_CHIPYARD_DIR source env.sh -cd $LOCAL_CHIPYARD_DIR/toolchains/esp-tools/riscv-isa-sim -echo Printing current spike commit -git log -1 --format="%H" - -cd $LOCAL_CHIPYARD_DIR/generators/gemmini -echo Printing current gemmini commit -git log -1 --format="%H" - cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests -echo Printing current gemmini-rocc-tests commit -git log -1 --format="%H" - ./build.sh cd build From ee5746cc1875c74e5841cb0fc904ce5a3ec22049 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sat, 26 Nov 2022 12:18:34 -0800 Subject: [PATCH 14/64] update readme --- README.md | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 77d6466d..1e4f78bc 100644 --- a/README.md +++ b/README.md @@ -32,27 +32,28 @@ Run these steps to install Chipyard and Spike (make sure to checkout the correct ```shell git clone https://github.com/ucb-bar/chipyard.git cd chipyard -git checkout 
481398b910fa95ec88dd578c67ba358a4d83129d -./scripts/init-submodules-no-riscv-tools.sh -./scripts/build-toolchains.sh esp-tools +git checkout 1.8.1 +./build-setup.sh esp-tools source env.sh cd generators/gemmini git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" -git fetch --unshallow && git checkout dev && git pull origin dev -git submodule update +git checkout dev && git pull origin dev +git submodule update --init --recursive + +SPIKE_HASH=$(cat SPIKE.hash) cd - cd toolchains/esp-tools/riscv-isa-sim/build -git fetch --unshallow && git checkout 2ed403a70f65559a3c2a06bf724d4737edc73a23 +git checkout $SPIKE_HASH make && make install # The final step is only necessary if you want to run MIDAS simulations with # realistic DRAM models cd - cd sims/firesim -git fetch --tags && git checkout 1.13.6 +source sourceme-f1-manager.sh --skip-ssh-setup # Ignore error messages from this command ./build-setup.sh --library --skip-validate ``` @@ -466,7 +467,7 @@ When calling `config_mvin` (described below), the programmer can choose which `m **Format:** `config_ex rs1 rs2` - `rs1[1:0]` must be `00` - `rs1[2]` determines if output (0) or weight (1) stationary -- `rs1[4:3]` = activation function: either relu (1), relu6 (2), or no activation function (0) +- `rs1[3]` = activation function: either relu (1) or no activation function (0) - `rs1[8]` = should A be transposed? - `rs1[9]` = should B be transposed? - `rs1[31:16]` = the stride (in scratchpad addresses) by which the rows of A are fed into the systolic array. 
@@ -477,8 +478,6 @@ If the stride is 2, then we feed every other row into the systolic array instead - In the default config, `rs1[63:32]` is of type `float32` - `rs2[31:0]` = the number of bits by which the accumulated result of a matmul is right-shifted when leaving the systolic array - This parameter is only relevant in output-stationary mode, when partial sums must be accumulated within the systolic array itself, and scaled-down when leaving the systolic array and being written into the scratchpad. -- `rs2[63:32]` = the number of bits by which 6 should be left-shifted before applying relu6 - - This parameter is ignored if the relu6 activation function is not being used. - `funct` = 0 **Action:** mode <= rs1(2); shift <= rs2; A_stride <= rs1[31:16] @@ -532,6 +531,12 @@ The parameters controlling this feature are: **Action:** stride <= rs2; max-pooling parameters <= rs1 +### `config_norm` configures normalization commands +**Format:** `config_norm rs1 rs2` + +`config_norm` is an **experimental** command added primarily to support an integer-only variant of BERT called [I-BERT](https://arxiv.org/abs/2101.01321) on Gemmini. +The command allows users to set scalar constants that are used by I-BERT's GELU, layernorm, softmax variants. + ### `flush` flushes the TLB **Format:** `flush rs1` - `rs1` = If `rs1[0]` is 1, then the current TLB request is skipped (if it has hit a page-fault and is waiting for an interrupt). 
From 2f85926d0d08f21aebd302163b0c6cafe43dc484 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sun, 27 Nov 2022 11:09:55 -0800 Subject: [PATCH 15/64] bump onnxruntime-riscv --- software/onnxruntime-riscv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/onnxruntime-riscv b/software/onnxruntime-riscv index daa8999f..f6d2fc95 160000 --- a/software/onnxruntime-riscv +++ b/software/onnxruntime-riscv @@ -1 +1 @@ -Subproject commit daa8999f80d5a233b6c478039f548751b7f02f38 +Subproject commit f6d2fc95463316ec47d7f832f35be03c26887922 From 0d863352425968d70b0454ce0236bcfd3372d16f Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sun, 27 Nov 2022 11:16:53 -0800 Subject: [PATCH 16/64] update readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1e4f78bc..20f136d3 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ We provide here a quick guide to installing Gemmini's dependencies (Chipyard and Dependencies --------- -Before beginning, install the [Chipyard dependencies](https://chipyard.readthedocs.io/en/latest/Chipyard-Basics/Initial-Repo-Setup.html#requirements) that are described here. +Before beginning, install the [Chipyard dependencies](https://chipyard.readthedocs.io/en/latest/Chipyard-Basics/Initial-Repo-Setup.html#default-requirements-installation). Installing Chipyard and Spike ----------------------------- @@ -144,7 +144,7 @@ cd chipyard/generators/gemmini Next steps -------- -Check out [our IISWC 2021 tutorial](https://sites.google.com/berkeley.edu/gemminitutorialiiswc2021/) to learn how to: +Check out our [MLSys 2022 tutorial](https://sites.google.com/berkeley.edu/gemmini-tutorial-mlsys-2022) (or our earlier but more out-of-date [IISWC 2021 tutorial](https://sites.google.com/berkeley.edu/gemminitutorialiiswc2021/)) to learn how to: * build different types of diverse accelerators using Gemmini. * add custom datatypes to Gemmini. * write your own Gemmini programs. 
@@ -535,7 +535,7 @@ The parameters controlling this feature are: **Format:** `config_norm rs1 rs2` `config_norm` is an **experimental** command added primarily to support an integer-only variant of BERT called [I-BERT](https://arxiv.org/abs/2101.01321) on Gemmini. -The command allows users to set scalar constants that are used by I-BERT's GELU, layernorm, softmax variants. +The command allows users to set scalar constants that are used by I-BERT's GELU, layernorm, and softmax variants. ### `flush` flushes the TLB **Format:** `flush rs1` From d3a58f4c978c3a5a84ef9950fa9d4339395d37fe Mon Sep 17 00:00:00 2001 From: gnodipac886 <44887166+gnodipac886@users.noreply.github.com> Date: Thu, 29 Dec 2022 22:21:43 -0500 Subject: [PATCH 17/64] reduce PE area by using a single MAC unit per PE (#265) PE was not synthesizing properly on the FPGA since it was synthesizing multiple mac units in a single PE. I added a new mac unit class and instantiate a single mac unit per PE to reduce area overhead. This addresses #262 Co-authored-by: Hasan Genc Co-authored-by: Eric Dong --- README.md | 8 +------ src/main/scala/gemmini/PE.scala | 40 +++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index e96c9177..5f310564 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,6 @@

-Upcoming Tutorial -=============================== -We will be presenting [a new tutorial](https://sites.google.com/berkeley.edu/gemmini-tutorial-mlsys-2022) for Gemmini at MLSys 2022, on August 29th, 2022. - -If you would like to attend, **then please register online** [at this link](https://docs.google.com/forms/d/1bdIXegBkEMJY88YuD80HN40haZ9tx_bZgmaN3FON5DI/edit). We're looking forward to meeting you all! - Gemmini ==================================== @@ -52,7 +46,7 @@ SPIKE_HASH=$(cat SPIKE.hash) cd - cd toolchains/esp-tools/riscv-isa-sim/build -git checkout $SPIKE_HASH +git fetch && git checkout $SPIKE_HASH make && make install # The final step is only necessary if you want to run MIDAS simulations with diff --git a/src/main/scala/gemmini/PE.scala b/src/main/scala/gemmini/PE.scala index 5f7205bd..9518942f 100644 --- a/src/main/scala/gemmini/PE.scala +++ b/src/main/scala/gemmini/PE.scala @@ -11,6 +11,18 @@ class PEControl[T <: Data : Arithmetic](accType: T) extends Bundle { } +class MacUnit[T <: Data](inputType: T, cType: T, dType: T) (implicit ev: Arithmetic[T]) extends Module { + import ev._ + val io = IO(new Bundle { + val in_a = Input(inputType) + val in_b = Input(inputType) + val in_c = Input(cType) + val out_d = Output(dType) + }) + + io.out_d := io.in_c.mac(io.in_a, io.in_b) +} + // TODO update documentation /** * A PE implementing a MAC operation. Configured as fully combinational when integrated into a Mesh. @@ -45,6 +57,12 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, val cType = if (df == Dataflow.WS) inputType else accType + // When creating PEs that support multiple dataflows, the + // elaboration/synthesis tools often fail to consolidate and de-duplicate + // MAC units. 
To force mac circuitry to be re-used, we create a "mac_unit" + // module here which just performs a single MAC operation + val mac_unit = Module(new MacUnit(inputType, cType, outputType)) + val a = io.in_a val b = io.in_b val d = io.in_d @@ -65,6 +83,8 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, io.out_last := last io.out_valid := valid + mac_unit.io.in_a := a + val last_s = RegEnable(prop, valid) val flip = last_s =/= prop val shift_offset = Mux(flip, shift, 0.U) @@ -82,22 +102,30 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, when(prop === PROPAGATE) { io.out_c := (c1 >> shift_offset).clippedToWidthOf(outputType) io.out_b := b - c2 := c2.mac(a, b.asTypeOf(inputType)) + mac_unit.io.in_b := b.asTypeOf(inputType) + mac_unit.io.in_c := c2 + c2 := mac_unit.io.out_d c1 := d.withWidthOf(cType) }.otherwise { io.out_c := (c2 >> shift_offset).clippedToWidthOf(outputType) io.out_b := b - c1 := c1.mac(a, b.asTypeOf(inputType)) + mac_unit.io.in_b := b.asTypeOf(inputType) + mac_unit.io.in_c := c1 + c1 := mac_unit.io.out_d c2 := d.withWidthOf(cType) } }.elsewhen ((df == Dataflow.WS).B || ((df == Dataflow.BOTH).B && dataflow === WEIGHT_STATIONARY)) { when(prop === PROPAGATE) { io.out_c := c1 - io.out_b := b.mac(a, c2.asTypeOf(inputType)) + mac_unit.io.in_b := c2.asTypeOf(inputType) + mac_unit.io.in_c := b + io.out_b := mac_unit.io.out_d c1 := d }.otherwise { io.out_c := c2 - io.out_b := b.mac(a, c1.asTypeOf(inputType)) + mac_unit.io.in_b := c1.asTypeOf(inputType) + mac_unit.io.in_c := b + io.out_b := mac_unit.io.out_d c2 := d } }.otherwise { @@ -105,10 +133,14 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, //assert(false.B, "unknown dataflow") io.out_c := DontCare io.out_b := DontCare + mac_unit.io.in_b := b.asTypeOf(inputType) + mac_unit.io.in_c := c2 } when (!valid) { c1 := c1 c2 := c2 + mac_unit.io.in_b := DontCare + mac_unit.io.in_c := DontCare } } From 
92972df936bc02608f29c1eb75bd4d146e4d4932 Mon Sep 17 00:00:00 2001 From: SingularityKChen Date: Fri, 30 Dec 2022 11:26:18 +0800 Subject: [PATCH 18/64] fix: chisel3 pr #2758 (#269) Remove parenthesized forms of asUInt(), asBool(), asSInt(), orR(), andR(), zext() Co-authored-by: Abraham Gonzalez Co-authored-by: Hasan Genc Co-authored-by: joey0320 --- src/main/scala/gemmini/AccumulatorMem.scala | 6 +- src/main/scala/gemmini/AccumulatorScale.scala | 6 +- src/main/scala/gemmini/Arithmetic.scala | 28 ++-- src/main/scala/gemmini/BeatMerger.scala | 16 +-- src/main/scala/gemmini/DMA.scala | 20 +-- .../scala/gemmini/DMACommandTracker.scala | 2 +- src/main/scala/gemmini/DSEConfigs.scala | 6 +- .../scala/gemmini/ExecuteController.scala | 8 +- src/main/scala/gemmini/FrontendTLB.scala | 2 +- src/main/scala/gemmini/Im2Col.scala | 4 +- .../gemmini/InstructionCompression.scala | 4 +- src/main/scala/gemmini/LocalAddr.scala | 4 +- src/main/scala/gemmini/LoopConv.scala | 126 +++++++++--------- src/main/scala/gemmini/LoopMatmul.scala | 56 ++++---- src/main/scala/gemmini/LoopUnroller.scala | 4 +- src/main/scala/gemmini/MeshWithDelays.scala | 2 +- src/main/scala/gemmini/Normalizer.scala | 10 +- src/main/scala/gemmini/PixelRepeater.scala | 6 +- .../scala/gemmini/ReservationStation.scala | 2 +- src/main/scala/gemmini/Scratchpad.scala | 6 +- src/main/scala/gemmini/StoreController.scala | 4 +- src/main/scala/gemmini/TagQueue.scala | 2 +- src/main/scala/gemmini/TilerScheduler.scala | 4 +- src/main/scala/gemmini/Util.scala | 12 +- src/main/scala/gemmini/XactTracker.scala | 2 +- src/main/scala/gemmini/ZeroWriter.scala | 2 +- 26 files changed, 172 insertions(+), 172 deletions(-) diff --git a/src/main/scala/gemmini/AccumulatorMem.scala b/src/main/scala/gemmini/AccumulatorMem.scala index dd5ed821..c664bd0f 100644 --- a/src/main/scala/gemmini/AccumulatorMem.scala +++ b/src/main/scala/gemmini/AccumulatorMem.scala @@ -230,7 +230,7 @@ class AccumulatorMem[T <: Data, U <: Data]( val wmask = 
Mux1H(w_q_head.asBools, w_q.map(_.mask)) val waddr = Mux1H(w_q_head.asBools, w_q.map(_.addr)) when (wen) { - w_q_head := (w_q_head << 1).asUInt() | w_q_head(nEntries-1) + w_q_head := (w_q_head << 1).asUInt | w_q_head(nEntries-1) for (i <- 0 until nEntries) { when (w_q_head(i)) { w_q(i).valid := false.B @@ -243,7 +243,7 @@ class AccumulatorMem[T <: Data, U <: Data]( when (w_q_push) { assert(!w_q_full || wen, "we ran out of acc-sub-bank write q entries") - w_q_tail := (w_q_tail << 1).asUInt() | w_q_tail(nEntries-1) + w_q_tail := (w_q_tail << 1).asUInt | w_q_tail(nEntries-1) for (i <- 0 until nEntries) { when (w_q_tail(i)) { w_q(i).valid := true.B @@ -334,7 +334,7 @@ class AccumulatorMem[T <: Data, U <: Data]( io.write.ready := !block_write_req && !pipelined_writes.map(r => r.valid && r.bits.addr === io.write.bits.addr && io.write.bits.acc).reduce(_||_) - when (reset.asBool()) { + when (reset.asBool) { pipelined_writes.foreach(_.valid := false.B) } diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala index 1fdd15fa..bf98a144 100644 --- a/src/main/scala/gemmini/AccumulatorScale.scala +++ b/src/main/scala/gemmini/AccumulatorScale.scala @@ -175,7 +175,7 @@ class AccumulatorScale[T <: Data, U <: Data]( completed_masks(i).foreach(_ := false.B) } } - tail_oh := (tail_oh << 1).asUInt() | tail_oh(nEntries-1) + tail_oh := (tail_oh << 1).asUInt | tail_oh(nEntries-1) } val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t)(ev))) } @@ -282,10 +282,10 @@ object AccumulatorScale { // qln2_inv / S / (2 ** 16) = 1 / ln2 // q * qln2_inv = x / S / ln2 * S * (2 ** 16) = x / ln2 * (2 ** 16) val neg_q_iexp = neg(q) - val z_iexp = (neg_q_iexp * qln2_inv).asUInt().do_>>(16).asTypeOf(q) // q is non-positive + val z_iexp = (neg_q_iexp * qln2_inv).asUInt.do_>>(16).asTypeOf(q) // q is non-positive val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q) val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + 
qb).withWidthOf(q) // we dont want a rounding shift - (q_poly_iexp.asUInt().do_>>(z_iexp.asUInt()(5, 0))).asTypeOf(q) + (q_poly_iexp.asUInt.do_>>(z_iexp.asUInt(5, 0))).asTypeOf(q) }} diff --git a/src/main/scala/gemmini/Arithmetic.scala b/src/main/scala/gemmini/Arithmetic.scala index cdd36396..c6792578 100644 --- a/src/main/scala/gemmini/Arithmetic.scala +++ b/src/main/scala/gemmini/Arithmetic.scala @@ -62,12 +62,12 @@ object Arithmetic { // TODO Do we need to explicitly handle the cases where "u" is a small number (like 0)? What is the default behavior here? val point_five = Mux(u === 0.U, 0.U, self(u - 1.U)) - val zeros = Mux(u <= 1.U, 0.U, self.asUInt() & ((1.U << (u - 1.U)).asUInt() - 1.U)) =/= 0.U + val zeros = Mux(u <= 1.U, 0.U, self.asUInt & ((1.U << (u - 1.U)).asUInt - 1.U)) =/= 0.U val ones_digit = self(u) val r = point_five & (zeros | ones_digit) - (self >> u).asUInt() + r + (self >> u).asUInt + r } override def >(t: UInt): Bool = self > t @@ -99,19 +99,19 @@ object Arithmetic { // TODO Do we need to explicitly handle the cases where "u" is a small number (like 0)? What is the default behavior here? 
val point_five = Mux(u === 0.U, 0.U, self(u - 1.U)) - val zeros = Mux(u <= 1.U, 0.U, self.asUInt() & ((1.U << (u - 1.U)).asUInt() - 1.U)) =/= 0.U + val zeros = Mux(u <= 1.U, 0.U, self.asUInt & ((1.U << (u - 1.U)).asUInt - 1.U)) =/= 0.U val ones_digit = self(u) - val r = (point_five & (zeros | ones_digit)).asBool() + val r = (point_five & (zeros | ones_digit)).asBool - (self >> u).asSInt() + Mux(r, 1.S, 0.S) + (self >> u).asSInt + Mux(r, 1.S, 0.S) } override def >(t: SInt): Bool = self > t override def withWidthOf(t: SInt) = { if (self.getWidth >= t.getWidth) - self(t.getWidth-1, 0).asSInt() + self(t.getWidth-1, 0).asSInt else { val sign_bits = t.getWidth - self.getWidth val sign = self(self.getWidth-1) @@ -122,7 +122,7 @@ object Arithmetic { override def clippedToWidthOf(t: SInt): SInt = { val maxsat = ((1 << (t.getWidth-1))-1).S val minsat = (-(1 << (t.getWidth-1))).S - MuxCase(self, Seq((self > maxsat) -> maxsat, (self < minsat) -> minsat))(t.getWidth-1, 0).asSInt() + MuxCase(self, Seq((self > maxsat) -> maxsat, (self < minsat) -> minsat))(t.getWidth-1, 0).asSInt } override def relu: SInt = Mux(self >= 0.S, self, 0.S) @@ -144,7 +144,7 @@ object Arithmetic { def sin_to_float(x: SInt) = { val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth)) in_to_rec_fn.io.signedIn := true.B - in_to_rec_fn.io.in := x.asUInt() + in_to_rec_fn.io.in := x.asUInt in_to_rec_fn.io.roundingMode := consts.round_minMag // consts.round_near_maxMag in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding @@ -167,7 +167,7 @@ object Arithmetic { rec_fn_to_in.io.in := x rec_fn_to_in.io.roundingMode := consts.round_minMag // consts.round_near_maxMag - rec_fn_to_in.io.out.asSInt() + rec_fn_to_in.io.out.asSInt } val self_rec = sin_to_float(self) @@ -207,7 +207,7 @@ object Arithmetic { def in_to_float(x: SInt) = { val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth)) in_to_rec_fn.io.signedIn := true.B - in_to_rec_fn.io.in := 
x.asUInt() + in_to_rec_fn.io.in := x.asUInt in_to_rec_fn.io.roundingMode := consts.round_minMag // consts.round_near_maxMag in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding @@ -220,7 +220,7 @@ object Arithmetic { rec_fn_to_in.io.in := x rec_fn_to_in.io.roundingMode := consts.round_minMag // consts.round_near_maxMag - rec_fn_to_in.io.out.asSInt() + rec_fn_to_in.io.out.asSInt } val self_rec = in_to_float(self) @@ -255,7 +255,7 @@ object Arithmetic { def in_to_float(x: SInt) = { val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth)) in_to_rec_fn.io.signedIn := true.B - in_to_rec_fn.io.in := x.asUInt() + in_to_rec_fn.io.in := x.asUInt in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding @@ -291,7 +291,7 @@ object Arithmetic { def in_to_float(x: SInt) = { val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth)) in_to_rec_fn.io.signedIn := true.B - in_to_rec_fn.io.in := x.asUInt() + in_to_rec_fn.io.in := x.asUInt in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding @@ -304,7 +304,7 @@ object Arithmetic { rec_fn_to_in.io.in := x rec_fn_to_in.io.roundingMode := consts.round_minMag - rec_fn_to_in.io.out.asSInt() + rec_fn_to_in.io.out.asSInt } val self_rec = in_to_float(self) diff --git a/src/main/scala/gemmini/BeatMerger.scala b/src/main/scala/gemmini/BeatMerger.scala index a6a67dab..e8f22b2a 100644 --- a/src/main/scala/gemmini/BeatMerger.scala +++ b/src/main/scala/gemmini/BeatMerger.scala @@ -59,11 +59,11 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid } val last_sending = bytesSent_next === req.bits.bytes_to_read - val last_reading = beatBytes.U >= (1.U << req.bits.lg_len_req).asUInt() - bytesRead + val last_reading = beatBytes.U >= (1.U << req.bits.lg_len_req).asUInt - 
bytesRead io.req.ready := !req.valid - io.in.ready := io.req.fire || (req.valid && bytesRead =/= (1.U << req.bits.lg_len_req).asUInt()) + io.in.ready := io.req.fire || (req.valid && bytesRead =/= (1.U << req.bits.lg_len_req).asUInt) io.out.valid := req.valid && usefulBytesRead > bytesSent && (usefulBytesRead - bytesSent >= rowBytes || usefulBytesRead === req.bits.bytes_to_read) @@ -90,7 +90,7 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid io.out.bits.accumulate := req.bits.accumulate io.out.bits.has_acc_bitwidth := req.bits.has_acc_bitwidth - when (bytesRead === (1.U << req.bits.lg_len_req).asUInt() && + when (bytesRead === (1.U << req.bits.lg_len_req).asUInt && bytesSent === req.bits.bytes_to_read) { req.pop() } @@ -98,7 +98,7 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid when (io.out.fire) { bytesSent := bytesSent_next - when (last_sending && bytesRead === (1.U << req.bits.lg_len_req).asUInt()) { + when (last_sending && bytesRead === (1.U << req.bits.lg_len_req).asUInt) { req.pop() io.req.ready := true.B } @@ -116,16 +116,16 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid val current_usefulBytesRead = Mux(io.req.fire, 0.U, usefulBytesRead) val current_shift = Mux(io.req.fire, io.req.bits.shift, req.bits.shift) val current_lg_len_req = Mux(io.req.fire, io.req.bits.lg_len_req, req.bits.lg_len_req) - val current_len_req = (1.U << current_lg_len_req).asUInt() + val current_len_req = (1.U << current_lg_len_req).asUInt when (current_shift - current_bytesDiscarded <= beatBytes.U /* && current_bytesRead < current_len_req */ ) { val rshift = (current_shift - current_bytesDiscarded) * 8.U // in bits val lshift = current_usefulBytesRead * 8.U // in bits - val mask = (~(((~0.U(beatBits.W)) >> rshift) << lshift)).asUInt() + val mask = (~(((~0.U(beatBits.W)) >> rshift) << lshift)).asUInt - buffer := (buffer & mask) | ((io.in.bits >> rshift) << lshift).asUInt() + 
buffer := (buffer & mask) | ((io.in.bits >> rshift) << lshift).asUInt } bytesRead := satAdd(current_bytesRead, beatBytes.U, current_len_req) @@ -135,7 +135,7 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid } } - when (reset.asBool()) { + when (reset.asBool) { req.valid := false.B } } diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala index 71148b67..729e17c7 100644 --- a/src/main/scala/gemmini/DMA.scala +++ b/src/main/scala/gemmini/DMA.scala @@ -380,7 +380,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: // TODO use the same register to hold data_blocks and data_single_block, so that this Mux here is not necessary val data_blocks = Reg(Vec(maxBlocks, UInt((inputTypeRowBytes * 8).W))) val data_single_block = Reg(UInt(dataWidth.W)) // For data that's just one-block-wide - val data = Mux(req.block === 0.U, data_single_block, data_blocks.asUInt()) + val data = Mux(req.block === 0.U, data_single_block, data_blocks.asUInt) val bytesSent = Reg(UInt(log2Ceil((dataBytes max maxBytes)+1).W)) // TODO this only needs to count up to (dataBytes/aligned_to), right? 
val bytesLeft = req.len - bytesSent @@ -390,9 +390,9 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: val xactId = OHToUInt(xactOnehot) val xactBusy_fire = WireInit(false.B) - val xactBusy_add = Mux(xactBusy_fire, (1.U << xactId).asUInt(), 0.U) - val xactBusy_remove = ~Mux(tl.d.fire, (1.U << tl.d.bits.source).asUInt(), 0.U) - xactBusy := (xactBusy | xactBusy_add) & xactBusy_remove.asUInt() + val xactBusy_add = Mux(xactBusy_fire, (1.U << xactId).asUInt, 0.U) + val xactBusy_remove = ~Mux(tl.d.fire, (1.U << tl.d.bits.source).asUInt, 0.U) + xactBusy := (xactBusy | xactBusy_add) & xactBusy_remove.asUInt val state_machine_ready_for_req = WireInit(state === s_idle) io.req.ready := state_machine_ready_for_req @@ -482,15 +482,15 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: fromSource = RegEnableThru(xactId, state === s_writing_new_block), toAddress = 0.U, lgSize = lg_write_size, - data = (data >> (bytesSent * 8.U)).asUInt() + data = (data >> (bytesSent * 8.U)).asUInt )._2 val putPartial = edge.Put( fromSource = RegEnableThru(xactId, state === s_writing_new_block), toAddress = 0.U, lgSize = lg_write_size, - data = ((data >> (bytesSent * 8.U)) << (write_shift * 8.U)).asUInt(), - mask = write_mask.asUInt() + data = ((data >> (bytesSent * 8.U)) << (write_shift * 8.U)).asUInt, + mask = write_mask.asUInt )._2 class TLBundleAWithInfo extends Bundle { @@ -501,7 +501,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: val untranslated_a = Wire(Decoupled(new TLBundleAWithInfo)) xactBusy_fire := untranslated_a.fire && state === s_writing_new_block - untranslated_a.valid := (state === s_writing_new_block || state === s_writing_beats) && !xactBusy.andR() + untranslated_a.valid := (state === s_writing_new_block || state === s_writing_beats) && !xactBusy.andR untranslated_a.bits.tl_a := Mux(write_full, putFull, putPartial) untranslated_a.bits.vaddr := write_vaddr 
untranslated_a.bits.status := req.status @@ -543,7 +543,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: tl.a.bits := translate_q.io.deq.bits.tl_a tl.a.bits.address := RegEnableThru(io.tlb.resp.paddr, RegNext(io.tlb.req.fire)) - tl.d.ready := xactBusy.orR() + tl.d.ready := xactBusy.orR when (untranslated_a.fire) { when (state === s_writing_new_block) { @@ -588,7 +588,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: val v1 = io.req.bits.data.asTypeOf(Vec(cols, inputType)) val v2 = data_single_block.asTypeOf(Vec(cols, inputType)) val m = v1.zip(v2) - VecInit(m.zipWithIndex.map{case ((x, y), i) => if (i < block_cols) maxOf(x, y) else y}).asUInt() + VecInit(m.zipWithIndex.map{case ((x, y), i) => if (i < block_cols) maxOf(x, y) else y}).asUInt } req := io.req.bits diff --git a/src/main/scala/gemmini/DMACommandTracker.scala b/src/main/scala/gemmini/DMACommandTracker.scala index 9d4f71e6..a2b5df32 100644 --- a/src/main/scala/gemmini/DMACommandTracker.scala +++ b/src/main/scala/gemmini/DMACommandTracker.scala @@ -93,7 +93,7 @@ class DMACommandTracker[T <: Data](val nCmds: Int, val maxBytes: Int, tag_t: => cmds(io.cmd_completed.bits.cmd_id).valid := false.B } - when (reset.asBool()) { + when (reset.asBool) { cmds.foreach(_.init()) } } diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala index 3ed92c7c..257721ca 100644 --- a/src/main/scala/gemmini/DSEConfigs.scala +++ b/src/main/scala/gemmini/DSEConfigs.scala @@ -51,12 +51,12 @@ object DSEBaseConfig { // TODO Do we need to explicitly handle the cases where "u" is a small number (like 0)? What is the default behavior here? 
val point_five = Mux(u === 0.U, 0.U, t(u - 1.U)) - val zeros = Mux(u <= 1.U, 0.U, t.asUInt() & ((1.U << (u - 1.U)).asUInt() - 1.U)) =/= 0.U + val zeros = Mux(u <= 1.U, 0.U, t.asUInt & ((1.U << (u - 1.U)).asUInt - 1.U)) =/= 0.U val ones_digit = t(u) - val r = (point_five & (zeros | ones_digit)).asBool() + val r = (point_five & (zeros | ones_digit)).asBool - (t >> u).asSInt() + Mux(r, 1.S, 0.S) + (t >> u).asSInt + Mux(r, 1.S, 0.S) }, 0, UInt(8.W), -1)), acc_read_full_width = true, acc_read_small_width = true, diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala index 2ef7fa3f..65add720 100644 --- a/src/main/scala/gemmini/ExecuteController.scala +++ b/src/main/scala/gemmini/ExecuteController.scala @@ -801,8 +801,8 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In mesh_cntl_signals_q.io.enq.bits.first := !a_fire_started && !b_fire_started && !d_fire_started val readData = VecInit(io.srams.read.map(_.resp.bits.data)) - val accReadData = if (ex_read_from_acc) VecInit(io.acc.read_resp.map(_.bits.data.asUInt())) else readData - val im2ColData = io.im2col.resp.bits.a_im2col.asUInt() + val accReadData = if (ex_read_from_acc) VecInit(io.acc.read_resp.map(_.bits.data.asUInt)) else readData + val im2ColData = io.im2col.resp.bits.a_im2col.asUInt val readValid = VecInit(io.srams.read.map(bank => ex_read_from_spad.B && bank.resp.valid && !bank.resp.bits.fromDMA)) val accReadValid = VecInit(io.acc.read_resp.map(bank => ex_read_from_acc.B && bank.valid && !bank.bits.fromDMA)) @@ -933,7 +933,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In if (ex_write_to_spad) { io.srams.write(i).en := start_array_outputting && w_bank === i.U && !write_to_acc && !is_garbage_addr && write_this_row io.srams.write(i).addr := w_row - io.srams.write(i).data := activated_wdata.asUInt() + io.srams.write(i).data := activated_wdata.asUInt io.srams.write(i).mask := w_mask.flatMap(b => 
Seq.fill(inputType.getWidth / (aligned_to * 8))(b)) } else { io.srams.write(i).en := false.B @@ -993,7 +993,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In complete_bits_count := complete_bits_count + 1.U } - when (reset.asBool()) { + when (reset.asBool) { // pending_completed_rob_id.valid := false.B pending_completed_rob_ids.foreach(_.valid := false.B) } diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala index 6e7168e9..63a41496 100644 --- a/src/main/scala/gemmini/FrontendTLB.scala +++ b/src/main/scala/gemmini/FrontendTLB.scala @@ -119,7 +119,7 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_fi val last_translated_vpn = RegInit(0.U(vaddrBits.W)) val last_translated_ppn = RegInit(0.U(paddrBits.W)) - val l0_tlb_hit = last_translated_valid && ((client.req.bits.tlb_req.vaddr >> pgIdxBits).asUInt() === (last_translated_vpn >> pgIdxBits).asUInt()) + val l0_tlb_hit = last_translated_valid && ((client.req.bits.tlb_req.vaddr >> pgIdxBits).asUInt === (last_translated_vpn >> pgIdxBits).asUInt) val l0_tlb_paddr = Cat(last_translated_ppn >> pgIdxBits, client.req.bits.tlb_req.vaddr(pgIdxBits-1,0)) val tlb = if (use_shared_tlb) tlbs.head else tlbs(i) diff --git a/src/main/scala/gemmini/Im2Col.scala b/src/main/scala/gemmini/Im2Col.scala index a317902b..65c4dbd4 100644 --- a/src/main/scala/gemmini/Im2Col.scala +++ b/src/main/scala/gemmini/Im2Col.scala @@ -90,12 +90,12 @@ class Im2Col[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V //how much horizonal turn we have to compute (input_channel*kernel_dim/16) - //val turn = Mux(im2col_width(3,0) === 0.U, (im2col_width >> (log2Up(block_size)).U).asUInt(), (im2col_width >> (log2Up(block_size)).U).asUInt + 1.U) + //val turn = Mux(im2col_width(3,0) === 0.U, (im2col_width >> (log2Up(block_size)).U).asUInt, (im2col_width >> (log2Up(block_size)).U).asUInt + 1.U) val turn = filter_dim2//Mux(channel(3,0) === 
0.U, filter_dim2*channel(6, 4), filter_dim2*channel(6, 4) + 1.U) //Seah: added for more than 16 rows of output //how much vertical turn we have to compute (output_dim/16) - //val row_turn = Mux(output_dim(3,0) === 0.U, (output_dim >> (log2Up(block_size)).U).asUInt - 1.U, (output_dim >> (log2Up(block_size)).U).asUInt()) //im2col height + //val row_turn = Mux(output_dim(3,0) === 0.U, (output_dim >> (log2Up(block_size)).U).asUInt - 1.U, (output_dim >> (log2Up(block_size)).U).asUInt) //im2col height val row_turn = io.req.bits.row_turn val row_left = io.req.bits.row_left diff --git a/src/main/scala/gemmini/InstructionCompression.scala b/src/main/scala/gemmini/InstructionCompression.scala index 96bc77ee..fe6cd3d9 100644 --- a/src/main/scala/gemmini/InstructionCompression.scala +++ b/src/main/scala/gemmini/InstructionCompression.scala @@ -40,7 +40,7 @@ class InstCompressor(implicit p: Parameters) extends Module { buf(waddr).push(io.in.bits) } - when (reset.asBool()) { + when (reset.asBool) { buf.foreach(_.valid := false.B) } } @@ -79,7 +79,7 @@ class InstDecompressor(rob_entries: Int)(implicit p: Parameters) extends Module pushed_preload := false.B } - when (reset.asBool()) { + when (reset.asBool) { buf.valid := false.B } } diff --git a/src/main/scala/gemmini/LocalAddr.scala b/src/main/scala/gemmini/LocalAddr.scala index b53addea..fdaeffe6 100644 --- a/src/main/scala/gemmini/LocalAddr.scala +++ b/src/main/scala/gemmini/LocalAddr.scala @@ -40,8 +40,8 @@ class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_en def is_same_address(other: LocalAddr): Bool = is_acc_addr === other.is_acc_addr && data === other.data def is_same_address(other: UInt): Bool = is_same_address(other.asTypeOf(this)) - def is_garbage(dummy: Int = 0) = is_acc_addr && accumulate && read_full_acc_row && data.andR() && - (if (garbage_bit.getWidth > 0) garbage_bit.asBool() else true.B) + def is_garbage(dummy: Int = 0) = is_acc_addr && accumulate && read_full_acc_row && data.andR && + 
(if (garbage_bit.getWidth > 0) garbage_bit.asBool else true.B) def +(other: UInt) = { require(isPow2(sp_bank_entries)) // TODO remove this requirement diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index 210bcade..53032a51 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -174,10 +174,10 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi io.cmd.bits.rs1 := o.dram_addr val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType) mvin_cmd_rs2 := DontCare - mvin_cmd_rs2.num_rows := o.I.asUInt() - mvin_cmd_rs2.num_cols := o.J.asUInt() + mvin_cmd_rs2.num_rows := o.I.asUInt + mvin_cmd_rs2.num_cols := o.J.asUInt mvin_cmd_rs2.local_addr := cast_to_acc_addr(mvin_cmd_rs2.local_addr, o.spad_addr, accumulate = false.B, read_full = false.B) - io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt() + io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt } // Sending outputs @@ -257,8 +257,8 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw def undilated(x: UInt): UInt = (x +& req.input_dilated) >> req.input_dilated // Derived parameters - val max_ichs_per_mvin = Mux(ichs < (max_block_len * block_size).U, ichs, (max_block_len * block_size).U).zext() - val max_batches_per_mvin = Mux(batches < (max_block_len * block_size).U, batches, (max_block_len * block_size).U).zext() + val max_ichs_per_mvin = Mux(ichs < (max_block_len * block_size).U, ichs, (max_block_len * block_size).U).zext + val max_batches_per_mvin = Mux(batches < (max_block_len * block_size).U, batches, (max_block_len * block_size).U).zext val max_chs_per_mvin = Mux(req.trans_input_3120, max_batches_per_mvin, max_ichs_per_mvin) // Iterators @@ -268,34 +268,34 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw val ich = Reg(SInt(large_iterator_bitwidth.W)) // Calculated params - val irow_padded = irow +& undilated(upad).zext() - val icol_padded = icol +& undilated(lpad).zext() 
- val is_zeros = irow < 0.S || irow >= irows_unpadded.zext() || icol < 0.S || icol >= icols_unpadded.zext() + val irow_padded = irow +& undilated(upad).zext + val icol_padded = icol +& undilated(lpad).zext + val is_zeros = irow < 0.S || irow >= irows_unpadded.zext || icol < 0.S || icol >= icols_unpadded.zext val dram_stride = Mux(req.trans_input_3120, batch_size * (input_w/8).U, in_channels * (input_w/8).U) // Addresses - val dram_offset = Mux(req.trans_input_3120, (((ich * in_dim * in_dim +& irow*in_dim +& icol) * batches +& b) * (input_w/8).U).asUInt(), - (((b * in_dim * in_dim +& irow*in_dim +& icol) * in_channels +& ich) * (input_w/8).U).asUInt()) + val dram_offset = Mux(req.trans_input_3120, (((ich * in_dim * in_dim +& irow*in_dim +& icol) * batches +& b) * (input_w/8).U).asUInt, + (((b * in_dim * in_dim +& irow*in_dim +& icol) * in_channels +& ich) * (input_w/8).U).asUInt) val dram_addr = Mux(is_zeros, 0.U, req.dram_addr + LoopConv.castDramOffset(dram_offset)) val spad_addr = Mux(req.trans_input_3120, // To prevent Verilator errors, we replace some "/ block_size.U" calls here with ">> log2Up(block_size)" - req.addr_start.zext() +& (b >> log2Up(block_size)) * input_spad_stride +& ich * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample), - req.addr_start.zext() +& (ich >> log2Up(block_size)) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample)) + req.addr_start.zext +& (b >> log2Up(block_size)) * input_spad_stride +& ich * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample), + req.addr_start.zext +& (ich >> log2Up(block_size)) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) 
* (icols >> req.downsample) +& (icol_padded >> req.downsample)) // Sizes - val block_size_downsampled = (block_size.U << req.downsample).asUInt().zext() + val block_size_downsampled = (block_size.U << req.downsample).asUInt.zext val I = MuxCase( - Mux(icols_unpadded.zext() -& icol > block_size_downsampled, block_size_downsampled, icols_unpadded.zext() -& icol), + Mux(icols_unpadded.zext -& icol > block_size_downsampled, block_size_downsampled, icols_unpadded.zext -& icol), Seq( (icol < 0.S) -> Mux((0.S-&icol) > block_size.S, block_size.S, 0.S-&icol), - (icol >= icols_unpadded.zext()) -> Mux(icols_unpadded.zext() +& undilated(rpad).zext() -& icol > block_size.S, block_size.S, icols_unpadded.zext() +& undilated(rpad).zext() -& icol) + (icol >= icols_unpadded.zext) -> Mux(icols_unpadded.zext +& undilated(rpad).zext -& icol > block_size.S, block_size.S, icols_unpadded.zext +& undilated(rpad).zext -& icol) ) ) val K = Mux(req.trans_input_3120, - Mux(batches.zext() -& b > max_chs_per_mvin, max_chs_per_mvin, batches.zext() -& b), - Mux(ichs.zext() -& ich > max_chs_per_mvin, max_chs_per_mvin, ichs.zext() -& ich)) + Mux(batches.zext -& b > max_chs_per_mvin, max_chs_per_mvin, batches.zext -& b), + Mux(ichs.zext -& ich > max_chs_per_mvin, max_chs_per_mvin, ichs.zext -& ich)) class RoCCCommandWithAddr extends Bundle { val cmd = new RoCCCommand @@ -318,7 +318,7 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw config_cmd_rs1.state_id := 0.U config_cmd_rs1.shrink := 0.U config_cmd_rs1._unused := 1.U - config_cmd.rs1 := config_cmd_rs1.asUInt() + config_cmd.rs1 := config_cmd_rs1.asUInt config_cmd.rs2 := dram_stride << req.downsample @@ -348,10 +348,10 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw io.cmd.bits.rs1 := o.dram_addr val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType) mvin_cmd_rs2 := DontCare - mvin_cmd_rs2.num_rows := (o.I >> req.downsample).asUInt() - mvin_cmd_rs2.num_cols := o.K.asUInt() + 
mvin_cmd_rs2.num_rows := (o.I >> req.downsample).asUInt + mvin_cmd_rs2.num_cols := o.K.asUInt mvin_cmd_rs2.local_addr := cast_to_sp_addr(mvin_cmd_rs2.local_addr, o.spad_addr) - io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt() + io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt } // Sending outputs @@ -359,23 +359,23 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw when (state === config) { state := ld }.otherwise { - val b_it = Mux(req.trans_input_3120, max_chs_per_mvin.asUInt(), 1.U) - val ich_it = Mux(req.trans_input_3120, 1.U, max_chs_per_mvin.asUInt()) + val b_it = Mux(req.trans_input_3120, max_chs_per_mvin.asUInt, 1.U) + val ich_it = Mux(req.trans_input_3120, 1.U, max_chs_per_mvin.asUInt) - val next_ich = sFloorAdd(ich, ich_it, ichs.zext(), 0.S) - val next_icol = sFloorAdd(icol, I.asUInt(), (icols_unpadded +& undilated(rpad)).zext(), 0.S-&undilated(lpad).zext(), + val next_ich = sFloorAdd(ich, ich_it, ichs.zext, 0.S) + val next_icol = sFloorAdd(icol, I.asUInt, (icols_unpadded +& undilated(rpad)).zext, 0.S-&undilated(lpad).zext, next_ich === 0.S) - val next_irow = sFloorAdd(irow, 1.U << req.downsample, (irows_unpadded +& undilated(dpad)).zext(), 0.S-&undilated(upad).zext(), - next_icol === 0.S-&undilated(lpad).zext() && next_ich === 0.S) - val next_b = sFloorAdd(b, b_it, batches.zext(), 0.S, - next_irow === 0.S-&undilated(upad).zext() && next_icol === 0.S-&undilated(lpad).zext() && next_ich === 0.S) + val next_irow = sFloorAdd(irow, 1.U << req.downsample, (irows_unpadded +& undilated(dpad)).zext, 0.S-&undilated(upad).zext, + next_icol === 0.S-&undilated(lpad).zext && next_ich === 0.S) + val next_b = sFloorAdd(b, b_it, batches.zext, 0.S, + next_irow === 0.S-&undilated(upad).zext && next_icol === 0.S-&undilated(lpad).zext && next_ich === 0.S) ich := next_ich icol := next_icol irow := next_irow b := next_b - state := Mux(next_b === 0.S && next_irow === 0.S-&undilated(upad).zext() && next_icol === 0.S-&undilated(lpad).zext() && next_ich === 0.S, + 
state := Mux(next_b === 0.S && next_irow === 0.S-&undilated(upad).zext && next_icol === 0.S-&undilated(lpad).zext && next_ich === 0.S, idle, ld) } } @@ -385,8 +385,8 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw req := io.req.bits state := config b := 0.S - irow := 0.S -& ((io.req.bits.inner_bounds.upad +& io.req.bits.input_dilated) >> io.req.bits.input_dilated).zext() - icol := 0.S -& ((io.req.bits.inner_bounds.lpad +& io.req.bits.input_dilated) >> io.req.bits.input_dilated).zext() + irow := 0.S -& ((io.req.bits.inner_bounds.upad +& io.req.bits.input_dilated) >> io.req.bits.input_dilated).zext + icol := 0.S -& ((io.req.bits.inner_bounds.lpad +& io.req.bits.input_dilated) >> io.req.bits.input_dilated).zext ich := 0.S } } @@ -530,7 +530,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit mvin_cmd_rs2.num_rows := o.K mvin_cmd_rs2.num_cols := o.J mvin_cmd_rs2.local_addr := cast_to_sp_addr(mvin_cmd_rs2.local_addr, o.spad_addr) - io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt() + io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt } // Sending outputs @@ -633,8 +633,8 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera val ocol = Reg(UInt(small_iterator_bitwidth.W)) // TODO kernel-dilation and input-dilation can never be activated at the same time, so we can optimize out some multiplications by kernel_dilation - val skip_iteration = state >= pre && req.input_dilated && (((krow * kernel_dilation +& orow -& upad)(0) & req.input_dilated).asBool() || - ((kcol * kernel_dilation +& ocol -& lpad)(0) & req.input_dilated).asBool()) + val skip_iteration = state >= pre && req.input_dilated && (((krow * kernel_dilation +& orow -& upad)(0) & req.input_dilated).asBool || + ((kcol * kernel_dilation +& ocol -& lpad)(0) & req.input_dilated).asBool) val pixels = Mux(kcols - kcol > req.max_pixels_per_row, req.max_pixels_per_row, kcols - kcol) @@ -643,7 +643,7 @@ class LoopConvExecute(block_size: Int, 
large_iterator_bitwidth: Int, small_itera val I = Mux(req.trans_input_3120, Mux(batches - b > block_size.U, block_size.U, batches - b), - undilated(Mux(ocols - ocol > (block_size.U << req.input_dilated).asUInt(), (block_size.U << req.input_dilated).asUInt(), ocols - ocol))) + undilated(Mux(ocols - ocol > (block_size.U << req.input_dilated).asUInt, (block_size.U << req.input_dilated).asUInt, ocols - ocol))) val J = Mux(ochs - och > block_size.U, block_size.U, ochs - och) val K = pixels * Mux(kchs - kch > block_size.U, block_size.U, kchs - kch) @@ -687,16 +687,16 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera val config_cmd_rs1 = Wire(config_ex_rs1_t.cloneType) config_cmd_rs1 := DontCare - config_cmd_rs1.a_stride := (irows * icols).asUInt() + config_cmd_rs1.a_stride := (irows * icols).asUInt config_cmd_rs1.set_only_strides := 1.U config_cmd_rs1.cmd_type := 0.U val config_cmd_rs2 = Wire(new ConfigExRs2) config_cmd_rs2 := DontCare - config_cmd_rs2.c_stride := (orows * ocols).asUInt() + config_cmd_rs2.c_stride := (orows * ocols).asUInt - config_cmd.rs1 := config_cmd_rs1.asUInt() - config_cmd.rs2 := config_cmd_rs2.asUInt() + config_cmd.rs1 := config_cmd_rs1.asUInt + config_cmd.rs2 := config_cmd_rs2.asUInt val pre_cmd = Wire(new RoCCCommand) // preload pre_cmd := DontCare @@ -735,35 +735,35 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera val pre_cmd_rs1 = Wire(preload_rs1_t.cloneType) pre_cmd_rs1 := DontCare - pre_cmd_rs1.num_rows := o.K.asUInt() - pre_cmd_rs1.num_cols := o.J.asUInt() + pre_cmd_rs1.num_rows := o.K.asUInt + pre_cmd_rs1.num_cols := o.J.asUInt pre_cmd_rs1.local_addr := Mux(o.new_weights, cast_to_sp_addr(pre_cmd_rs1.local_addr, o.b_addr), garbage_addr(pre_cmd_rs1.local_addr)) val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType) pre_cmd_rs2 := DontCare - pre_cmd_rs2.num_rows := o.I.asUInt() - pre_cmd_rs2.num_cols := o.J.asUInt() + pre_cmd_rs2.num_rows := o.I.asUInt + pre_cmd_rs2.num_cols := 
o.J.asUInt pre_cmd_rs2.local_addr := cast_to_acc_addr(pre_cmd_rs2.local_addr, o.c_addr, accumulate = true.B, read_full = false.B) - io.cmd.bits.rs1 := pre_cmd_rs1.asUInt() - io.cmd.bits.rs2 := pre_cmd_rs2.asUInt() + io.cmd.bits.rs1 := pre_cmd_rs1.asUInt + io.cmd.bits.rs2 := pre_cmd_rs2.asUInt }.elsewhen(command_p.io.out.bits.cmd.inst.funct =/= CONFIG_CMD) { val o = command_p.io.out.bits val comp_cmd_rs1 = Wire(compute_rs1_t.cloneType) comp_cmd_rs1 := DontCare - comp_cmd_rs1.num_rows := o.I.asUInt() - comp_cmd_rs1.num_cols := o.K.asUInt() + comp_cmd_rs1.num_rows := o.I.asUInt + comp_cmd_rs1.num_cols := o.K.asUInt comp_cmd_rs1.local_addr := cast_to_sp_addr(comp_cmd_rs1.local_addr, o.a_addr) val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType) comp_cmd_rs2 := DontCare - comp_cmd_rs2.num_rows := o.I.asUInt() - comp_cmd_rs2.num_cols := o.J.asUInt() + comp_cmd_rs2.num_rows := o.I.asUInt + comp_cmd_rs2.num_cols := o.J.asUInt comp_cmd_rs2.local_addr := garbage_addr(comp_cmd_rs2.local_addr) - io.cmd.bits.rs1 := comp_cmd_rs1.asUInt() - io.cmd.bits.rs2 := comp_cmd_rs2.asUInt() + io.cmd.bits.rs1 := comp_cmd_rs1.asUInt + io.cmd.bits.rs2 := comp_cmd_rs2.asUInt } // Updating "new_weights" @@ -779,7 +779,7 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera state := comp }.otherwise { val b_it = Mux(req.trans_input_3120, block_size.U, 1.U) - val ocol_it = Mux(skip_iteration || req.trans_input_3120, 1.U, block_size.U << req.input_dilated).asUInt() + val ocol_it = Mux(skip_iteration || req.trans_input_3120, 1.U, block_size.U << req.input_dilated).asUInt val next_ocol = floorAdd(ocol, ocol_it, ocols) val next_orow = floorAdd(orow, 1.U, orows, next_ocol === 0.U) @@ -928,13 +928,13 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: pre_pool_config_cmd_rs1.pool_stride := pool_stride pre_pool_config_cmd_rs1.activation := req.activation pre_pool_config_cmd_rs1.cmd_type := CONFIG_STORE - pre_pool_config_cmd.rs1 := 
pre_pool_config_cmd_rs1.asUInt() + pre_pool_config_cmd.rs1 := pre_pool_config_cmd_rs1.asUInt val pre_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType) pre_pool_config_cmd_rs2 := DontCare pre_pool_config_cmd_rs2.acc_scale := ACC_SCALE_NO_CHANGE pre_pool_config_cmd_rs2.stride := out_channels * (input_w / 8).U - pre_pool_config_cmd.rs2 := pre_pool_config_cmd_rs2.asUInt() + pre_pool_config_cmd.rs2 := pre_pool_config_cmd_rs2.asUInt val post_pool_config_cmd = Wire(new RoCCCommand) post_pool_config_cmd := DontCare @@ -944,13 +944,13 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: post_pool_config_cmd_rs1 := DontCare post_pool_config_cmd_rs1.activation := req.activation post_pool_config_cmd_rs1.cmd_type := CONFIG_STORE - post_pool_config_cmd.rs1 := post_pool_config_cmd_rs1.asUInt() + post_pool_config_cmd.rs1 := post_pool_config_cmd_rs1.asUInt val post_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType) post_pool_config_cmd_rs2 := DontCare post_pool_config_cmd_rs2.acc_scale := ACC_SCALE_NO_CHANGE post_pool_config_cmd_rs2.stride := out_channels * (input_w / 8).U - post_pool_config_cmd.rs2 := post_pool_config_cmd_rs2.asUInt() + post_pool_config_cmd.rs2 := post_pool_config_cmd_rs2.asUInt val pool_cmd = Wire(new RoCCCommand) pool_cmd := DontCare @@ -990,16 +990,16 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: pool_mvout_cmd_rs2.local_addr := cast_to_acc_addr(pool_mvout_cmd_rs2.local_addr, o.pool_spad_addr, accumulate = false.B, read_full = false.B) io.cmd.bits.rs1 := o.pool_dram_addr - io.cmd.bits.rs2 := pool_mvout_cmd_rs2.asUInt() + io.cmd.bits.rs2 := pool_mvout_cmd_rs2.asUInt } .otherwise { val mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType) mvout_cmd_rs2 := DontCare - mvout_cmd_rs2.num_rows := o.I.asUInt() - mvout_cmd_rs2.num_cols := o.J.asUInt() + mvout_cmd_rs2.num_rows := o.I.asUInt + mvout_cmd_rs2.num_cols := o.J.asUInt mvout_cmd_rs2.local_addr := cast_to_acc_addr(mvout_cmd_rs2.local_addr, 
o.spad_addr, accumulate = false.B, read_full = false.B) io.cmd.bits.rs1 := o.dram_addr - io.cmd.bits.rs2 := mvout_cmd_rs2.asUInt() + io.cmd.bits.rs2 := mvout_cmd_rs2.asUInt } } @@ -1182,7 +1182,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: val concurrent_loops = 2 val loops = Reg(Vec(concurrent_loops, new LoopConvState(block_size, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, coreMaxAddrBits, max_addr, max_acc_addr))) val head_loop_id = RegInit(0.U(log2Up(concurrent_loops).W)) - val tail_loop_id = (~head_loop_id).asUInt() // This is the loop that we always try to configure if available + val tail_loop_id = (~head_loop_id).asUInt // This is the loop that we always try to configure if available val head_loop = loops(head_loop_id) val tail_loop = loops(tail_loop_id) @@ -1499,7 +1499,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: } // Resets - when (reset.asBool()) { + when (reset.asBool) { loops.zipWithIndex.foreach { case (l, i) => l.reset() l.a_addr_start := (i * (max_addr / concurrent_loops)).U diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala index 86552d56..a33155e9 100644 --- a/src/main/scala/gemmini/LoopMatmul.scala +++ b/src/main/scala/gemmini/LoopMatmul.scala @@ -76,10 +76,10 @@ class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType) mvin_cmd_rs2 := DontCare - mvin_cmd_rs2.num_rows := rows.asUInt() - mvin_cmd_rs2.num_cols := cols.asUInt() + mvin_cmd_rs2.num_rows := rows.asUInt + mvin_cmd_rs2.num_cols := cols.asUInt mvin_cmd_rs2.local_addr := cast_to_sp_addr(mvin_cmd_rs2.local_addr, sp_addr) - mvin_cmd.rs2 := mvin_cmd_rs2.asUInt() + mvin_cmd.rs2 := mvin_cmd_rs2.asUInt io.req.ready := state === idle io.i := i @@ -184,10 +184,10 @@ class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In val mvin_cmd_rs2 = 
Wire(mvin_rs2_t.cloneType) mvin_cmd_rs2 := DontCare - mvin_cmd_rs2.num_rows := rows.asUInt() - mvin_cmd_rs2.num_cols := cols.asUInt() + mvin_cmd_rs2.num_rows := rows.asUInt + mvin_cmd_rs2.num_cols := cols.asUInt mvin_cmd_rs2.local_addr := cast_to_sp_addr(mvin_cmd_rs2.local_addr, sp_addr) - mvin_cmd.rs2 := mvin_cmd_rs2.asUInt() + mvin_cmd.rs2 := mvin_cmd_rs2.asUInt io.req.ready := state === idle io.k := k @@ -281,10 +281,10 @@ class LoopMatmulLdD(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType) mvin_cmd_rs2 := DontCare - mvin_cmd_rs2.num_rows := rows.asUInt() - mvin_cmd_rs2.num_cols := cols.asUInt() + mvin_cmd_rs2.num_rows := rows.asUInt + mvin_cmd_rs2.num_cols := cols.asUInt mvin_cmd_rs2.local_addr := cast_to_acc_addr(mvin_cmd_rs2.local_addr, sp_addr, accumulate = false.B, read_full = false.B) - mvin_cmd.rs2 := mvin_cmd_rs2.asUInt() + mvin_cmd.rs2 := mvin_cmd_rs2.asUInt io.req.ready := state === idle io.idle := state === idle @@ -401,19 +401,19 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth val pre_cmd_rs1 = Wire(preload_rs1_t.cloneType) pre_cmd_rs1 := DontCare - pre_cmd_rs1.num_rows := b_rows.asUInt() - pre_cmd_rs1.num_cols := b_cols.asUInt() + pre_cmd_rs1.num_rows := b_rows.asUInt + pre_cmd_rs1.num_cols := b_cols.asUInt pre_cmd_rs1.local_addr := Mux(i === 0.U, cast_to_sp_addr(pre_cmd_rs1.local_addr, b_addr), garbage_addr(pre_cmd_rs1.local_addr)) val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType) pre_cmd_rs2 := DontCare - pre_cmd_rs2.num_rows := c_rows.asUInt() - pre_cmd_rs2.num_cols := c_cols.asUInt() + pre_cmd_rs2.num_rows := c_rows.asUInt + pre_cmd_rs2.num_cols := c_cols.asUInt pre_cmd_rs2.local_addr := cast_to_acc_addr(pre_cmd_rs2.local_addr, c_addr, accumulate = req.accumulate || k =/= 0.U, read_full = false.B) - pre_cmd.rs1 := pre_cmd_rs1.asUInt() - pre_cmd.rs2 := pre_cmd_rs2.asUInt() + pre_cmd.rs1 := pre_cmd_rs1.asUInt + pre_cmd.rs2 := pre_cmd_rs2.asUInt val comp_cmd 
= Wire(new RoCCCommand()) comp_cmd := DontCare @@ -421,8 +421,8 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth val comp_cmd_rs1 = Wire(compute_rs1_t.cloneType) comp_cmd_rs1 := DontCare - comp_cmd_rs1.num_rows := a_rows.asUInt() - comp_cmd_rs1.num_cols := a_cols.asUInt() + comp_cmd_rs1.num_rows := a_rows.asUInt + comp_cmd_rs1.num_cols := a_cols.asUInt comp_cmd_rs1.local_addr := cast_to_sp_addr(comp_cmd_rs1.local_addr, a_addr) val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType) @@ -431,8 +431,8 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth comp_cmd_rs2.num_cols := block_size.U comp_cmd_rs2.local_addr := garbage_addr(comp_cmd_rs2.local_addr) - comp_cmd.rs1 := comp_cmd_rs1.asUInt() - comp_cmd.rs2 := comp_cmd_rs2.asUInt() + comp_cmd.rs1 := comp_cmd_rs1.asUInt + comp_cmd.rs2 := comp_cmd_rs2.asUInt io.req.ready := state === idle io.k := k @@ -528,7 +528,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In val j = Reg(UInt(iterator_bitwidth.W)) val i = Reg(UInt(iterator_bitwidth.W)) - val acc_addr_start = /*(BigInt(1) << 31).U | (req.full_c << 29.U).asUInt() |*/ req.addr_start + val acc_addr_start = /*(BigInt(1) << 31).U | (req.full_c << 29.U).asUInt |*/ req.addr_start val dram_offset = Mux(req.full_c, (i * req.dram_stride + j) * block_size.U * (acc_w/8).U, (i * req.dram_stride + j) * block_size.U * (input_w/8).U) @@ -545,10 +545,10 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In val mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType) mvout_cmd_rs2 := DontCare - mvout_cmd_rs2.num_rows := rows.asUInt() - mvout_cmd_rs2.num_cols := cols.asUInt() + mvout_cmd_rs2.num_rows := rows.asUInt + mvout_cmd_rs2.num_cols := cols.asUInt mvout_cmd_rs2.local_addr := cast_to_acc_addr(mvout_cmd_rs2.local_addr, sp_addr, accumulate = false.B, read_full = req.full_c) - mvout_cmd.rs2 := mvout_cmd_rs2.asUInt() + mvout_cmd.rs2 := mvout_cmd_rs2.asUInt // Layernorm 
iterators and calculations val ln_row = Reg(UInt(iterator_bitwidth.W)) @@ -585,7 +585,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In val ln_config_norm = Wire(new RoCCCommand) ln_config_norm := DontCare ln_config_norm.inst.funct := CONFIG_CMD - ln_config_norm.rs1 := ln_config_norm_rs1.asUInt() + ln_config_norm.rs1 := ln_config_norm_rs1.asUInt ln_config_norm.rs2 := DontCare val ln_mvout_cmd = Wire(new RoCCCommand) @@ -596,10 +596,10 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In val ln_mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType) ln_mvout_cmd_rs2 := DontCare ln_mvout_cmd_rs2.num_rows := 1.U - ln_mvout_cmd_rs2.num_cols := cols.asUInt() + ln_mvout_cmd_rs2.num_cols := cols.asUInt ln_mvout_cmd_rs2.local_addr := cast_to_acc_addr(ln_mvout_cmd_rs2.local_addr, ln_sp_addr, accumulate = false.B, read_full = req.full_c) ln_mvout_cmd_rs2.local_addr.norm_cmd := ln_norm_cmd - ln_mvout_cmd.rs2 := ln_mvout_cmd_rs2.asUInt() + ln_mvout_cmd.rs2 := ln_mvout_cmd_rs2.asUInt io.req.ready := state === idle io.j := j @@ -760,7 +760,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size val concurrent_loops = 2 val loops = Reg(Vec(concurrent_loops, new LoopMatmulState(iterator_bitwidth, coreMaxAddrBits, max_addr, max_acc_addr))) val head_loop_id = Reg(UInt(log2Up(concurrent_loops).W)) - val tail_loop_id = (~head_loop_id).asUInt() // This is the loop that we always try to configure if available + val tail_loop_id = (~head_loop_id).asUInt // This is the loop that we always try to configure if available val head_loop = loops(head_loop_id) val tail_loop = loops(tail_loop_id) @@ -1052,7 +1052,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size } // Resets - when (reset.asBool()) { + when (reset.asBool) { loops.zipWithIndex.foreach { case (l, i) => l.reset() l.a_addr_start := (i * (max_addr / concurrent_loops)).U diff --git 
a/src/main/scala/gemmini/LoopUnroller.scala b/src/main/scala/gemmini/LoopUnroller.scala index 02ac7d71..63a0150b 100644 --- a/src/main/scala/gemmini/LoopUnroller.scala +++ b/src/main/scala/gemmini/LoopUnroller.scala @@ -42,8 +42,8 @@ class LoopUnroller(block_size: Int)(implicit p: Parameters) extends Module { val a_start = cmd.bits.rs1(31, 0) val b_start = cmd.bits.rs1(63, 32) - val c_start = (3.U << 30).asUInt() - val d_start = (1.U << 31).asUInt() + val c_start = (3.U << 30).asUInt + val d_start = (1.U << 31).asUInt // TODO get rid of the x * max_y multiplications here val a_addr = a_start + (i * max_k + k) * block_size.U diff --git a/src/main/scala/gemmini/MeshWithDelays.scala b/src/main/scala/gemmini/MeshWithDelays.scala index d0aced16..516760bf 100644 --- a/src/main/scala/gemmini/MeshWithDelays.scala +++ b/src/main/scala/gemmini/MeshWithDelays.scala @@ -248,7 +248,7 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data] io.req.ready := (!req.valid || last_fire) && tagq.io.enq.ready && total_rows_q.io.enq.ready io.tags_in_progress := tagq.io.all.map(_.tag) - when (reset.asBool()) { + when (reset.asBool) { req.valid := false.B } diff --git a/src/main/scala/gemmini/Normalizer.scala b/src/main/scala/gemmini/Normalizer.scala index 89dca2db..67dd18ac 100644 --- a/src/main/scala/gemmini/Normalizer.scala +++ b/src/main/scala/gemmini/Normalizer.scala @@ -351,7 +351,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ assert(acc_t.isInstanceOf[SInt]) when (stat.state === waiting_for_stddev) { - stat.inv_stddev := Mux(sqrt_out.bits.asUInt() === acc_t.zero.asUInt(), + stat.inv_stddev := Mux(sqrt_out.bits.asUInt === acc_t.zero.asUInt, 1.S(acc_t.getWidth.W).asTypeOf(acc_t), sqrt_out.bits ) @@ -405,14 +405,14 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ def in_to_float(x: SInt) = { val in_to_rec_fn = Module(new INToRecFN(intWidth = sum_exp_to_inv.getWidth, expWidth, sigWidth)) 
in_to_rec_fn.io.signedIn := true.B - in_to_rec_fn.io.in := x.asUInt() + in_to_rec_fn.io.in := x.asUInt in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding in_to_rec_fn.io.out } - val self_rec = in_to_float(sum_exp_to_inv.asUInt().asSInt()) + val self_rec = in_to_float(sum_exp_to_inv.asUInt.asSInt) val one_rec = in_to_float(127.S) // softmax maximum is 127 for signed int8 // Instantiate the hardloat divider @@ -436,7 +436,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val stat = stats(sum_exp_to_inv_id) exp_divider_in.valid := (stat.state === get_inv_sum_exp) && !lanes.io.busy - exp_divider_in.bits := sum_exp_to_inv.asUInt() + exp_divider_in.bits := sum_exp_to_inv.asUInt } { @@ -587,7 +587,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ assert(acc_t.getWidth == scale_t.getWidth, "we use the same variable to hold both the variance and the inv-stddev, so we need them to see the width") // Resets - when (reset.asBool()) { + when (reset.asBool) { stats.foreach(_.state := idle) stats.foreach(_.sum := acc_t.zero) stats.foreach(_.max := acc_t.minimum) diff --git a/src/main/scala/gemmini/PixelRepeater.scala b/src/main/scala/gemmini/PixelRepeater.scala index ddab4422..ecf9481c 100644 --- a/src/main/scala/gemmini/PixelRepeater.scala +++ b/src/main/scala/gemmini/PixelRepeater.scala @@ -48,8 +48,8 @@ class PixelRepeater[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols val out_shift = Wire(UInt(log2Up(block_cols / 2 + 1).W)) out_shift := req.bits.pixel_repeats * req.bits.len - io.resp.bits.out := (req.bits.in.asUInt() << (out_shift * t.getWidth.U)).asTypeOf(io.resp.bits.out) - io.resp.bits.mask := (req.bits.mask.asUInt() << (out_shift * ((t.getWidth / 8) / aligned_to).U)).asTypeOf(io.resp.bits.mask) + io.resp.bits.out := (req.bits.in.asUInt << (out_shift * t.getWidth.U)).asTypeOf(io.resp.bits.out) 
+ io.resp.bits.mask := (req.bits.mask.asUInt << (out_shift * ((t.getWidth / 8) / aligned_to).U)).asTypeOf(io.resp.bits.mask) io.resp.bits.last := req.bits.last && (req.bits.pixel_repeats === 0.U) io.resp.bits.tag := req.bits.tag @@ -84,7 +84,7 @@ class PixelRepeater[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols req.bits.pixel_repeats := io.req.bits.pixel_repeats - 1.U } - when(reset.asBool()) { + when(reset.asBool) { req.pop() } } diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala index 68d0e6e7..47dd5ef1 100644 --- a/src/main/scala/gemmini/ReservationStation.scala +++ b/src/main/scala/gemmini/ReservationStation.scala @@ -558,7 +558,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G PerfCounter(!io.alloc.ready, "reservation_station_full", "cycles where reservation station is full") } - when (reset.asBool()) { + when (reset.asBool) { entries.foreach(_.valid := false.B) } diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index 70c9140f..91870c80 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -149,9 +149,9 @@ class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean, us val ren = io.read.req.fire val rdata = if (single_ported) { assert(!(ren && io.write.en)) - read(raddr, ren && !io.write.en).asUInt() + read(raddr, ren && !io.write.en).asUInt } else { - read(raddr, ren).asUInt() + read(raddr, ren).asUInt } val fromDMA = io.read.req.bits.fromDMA @@ -537,7 +537,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, bio.write.mask := io.srams.write(i).mask }.elsewhen (dmaread) { bio.write.addr := laddr.sp_row() - bio.write.data := mvin_scale_pixel_repeater.io.resp.bits.out.asUInt() + bio.write.data := mvin_scale_pixel_repeater.io.resp.bits.out.asUInt bio.write.mask := mvin_scale_pixel_repeater.io.resp.bits.mask take 
((spad_w / (aligned_to * 8)) max 1) mvin_scale_pixel_repeater.io.resp.ready := true.B // TODO we combinationally couple valid and ready signals diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala index c9e4fdbb..bf4a71b2 100644 --- a/src/main/scala/gemmini/StoreController.scala +++ b/src/main/scala/gemmini/StoreController.scala @@ -226,7 +226,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm stride := config_stride activation := config_activation - when (!config_acc_scale.asUInt().andR()) { + when (!config_acc_scale.asUInt.andR) { acc_scale := config_acc_scale.asTypeOf(acc_scale_t) } @@ -248,7 +248,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm cmd.ready := true.B } .elsewhen(config.has_normalizations.B && DoConfigNorm) { - when (!config_set_stats_id_only.asBool()) { + when (!config_set_stats_id_only.asBool) { igelu_qb := config_igelu_qb.asTypeOf(igelu_qb) igelu_qc := config_igelu_qc.asTypeOf(igelu_qc) when(config_iexp_q_const_type === 0.U) { diff --git a/src/main/scala/gemmini/TagQueue.scala b/src/main/scala/gemmini/TagQueue.scala index 9a6464c3..f656119a 100644 --- a/src/main/scala/gemmini/TagQueue.scala +++ b/src/main/scala/gemmini/TagQueue.scala @@ -44,7 +44,7 @@ class TagQueue[T <: Data with TagQueueTag](t: T, entries: Int) extends Module { len := len - 1.U } - when (reset.asBool()) { + when (reset.asBool) { regs.foreach(_.make_this_garbage()) } diff --git a/src/main/scala/gemmini/TilerScheduler.scala b/src/main/scala/gemmini/TilerScheduler.scala index c09ff949..d9fd8b6a 100644 --- a/src/main/scala/gemmini/TilerScheduler.scala +++ b/src/main/scala/gemmini/TilerScheduler.scala @@ -253,7 +253,7 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data] Cat(older_in_same_q) | Cat(is_st_and_must_wait_for_prior_ex_config) | Cat(is_ex_config_and_must_wait_for_prior_st) - ).asBools().reverse + ).asBools.reverse 
new_entry.complete_on_issue := new_entry.is_config && new_entry.q =/= exq @@ -440,7 +440,7 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data] printf(p"Last allocated: $last_allocated\n\n") } - when (reset.asBool()) { + when (reset.asBool) { entries.foreach(_.valid := false.B) } } diff --git a/src/main/scala/gemmini/Util.scala b/src/main/scala/gemmini/Util.scala index 51dc1377..07c36554 100644 --- a/src/main/scala/gemmini/Util.scala +++ b/src/main/scala/gemmini/Util.scala @@ -47,9 +47,9 @@ object Util { def sFloorAdd(s: SInt, n: UInt, max_plus_one: SInt, min: SInt, en: Bool = true.B): SInt = { val max = max_plus_one - 1.S - MuxCase(s + n.zext(), Seq( + MuxCase(s + n.zext, Seq( (!en) -> s, - ((s +& n.zext()) > max) -> min + ((s +& n.zext) > max) -> min )) } @@ -66,22 +66,22 @@ object Util { def closestLowerPowerOf2(u: UInt): UInt = { // TODO figure out a more efficient way of doing this. Is this many muxes really necessary? - val exp = u.asBools().zipWithIndex.map { case (b, i) => + val exp = u.asBools.zipWithIndex.map { case (b, i) => Mux(b, i.U, 0.U) }.reduce((acc, u) => Mux(acc > u, acc, u)) - (1.U << exp).asUInt() + (1.U << exp).asUInt } def closestAlignedLowerPowerOf2(u: UInt, addr: UInt, stride: UInt, rowBytes: Int): UInt = { val lgRowBytes = log2Ceil(rowBytes) // TODO figure out a more efficient way of doing this. Is this many muxes really necessary? - val exp = u.asBools().zipWithIndex.map { case (b, i) => + val exp = u.asBools.zipWithIndex.map { case (b, i) => Mux(b && addr(i + lgRowBytes - 1, 0) === 0.U && stride(i + lgRowBytes - 1, 0) === 0.U, i.U, 0.U) }.reduce((acc, u) => Mux(acc > u, acc, u)) - (1.U << exp).asUInt() + (1.U << exp).asUInt } // This function will return "next" with a 0-cycle delay when the "enable" signal is high. 
It's like a queue with diff --git a/src/main/scala/gemmini/XactTracker.scala b/src/main/scala/gemmini/XactTracker.scala index 277626a1..afd8f964 100644 --- a/src/main/scala/gemmini/XactTracker.scala +++ b/src/main/scala/gemmini/XactTracker.scala @@ -84,7 +84,7 @@ class XactTracker[U <: Data](nXacts: Int, maxShift: Int, spadWidth: Int, accWidt assert(entries(io.peek.xactid).valid) } - when (reset.asBool()) { + when (reset.asBool) { entries.foreach(_.valid := false.B) } diff --git a/src/main/scala/gemmini/ZeroWriter.scala b/src/main/scala/gemmini/ZeroWriter.scala index a1834a41..17da09e1 100644 --- a/src/main/scala/gemmini/ZeroWriter.scala +++ b/src/main/scala/gemmini/ZeroWriter.scala @@ -70,7 +70,7 @@ class ZeroWriter[T <: Data, U <: Data, V <: Data, Tag <: Data](config: GemminiAr col_counter := 0.U } - when (reset.asBool()) { + when (reset.asBool) { req.pop() } } From 7d418c78d822ca465e4b5b3e71b6d7cb19037239 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Wed, 4 Jan 2023 01:24:26 -0800 Subject: [PATCH 19/64] Bump to scala 2.13/new rocketchip --- build.sbt | 2 +- src/main/scala/gemmini/Controller.scala | 2 +- src/main/scala/gemmini/DMA.scala | 9 ++++++--- src/main/scala/gemmini/Scratchpad.scala | 4 ++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/build.sbt b/build.sbt index b34575a0..de8bd4e5 100644 --- a/build.sbt +++ b/build.sbt @@ -4,7 +4,7 @@ name := "gemmini" version := "3.1.0" -scalaVersion := "2.12.10" +scalaVersion := "2.13.10" libraryDependencies ++= Seq( "edu.berkeley.cs" %% "chisel3" % "3.4.+", diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index 2c15d3ea..d4c5f7d5 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -65,7 +65,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] counters.io.event_io.collect(spad.module.io.counter) // TLB - implicit val edge = outer.node.edges.out.head + implicit val edge = 
outer.spad.id_node.edges.out.head val tlb = Module(new FrontendTLB(2, tlb_size, dma_maxbytes, use_tlb_register_filter, use_firesim_simulation_counters, use_shared_tlb)) (tlb.io.clients zip outer.spad.module.io.tlb).foreach(t => t._1 <> t._2) diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala index 71148b67..12a7509f 100644 --- a/src/main/scala/gemmini/DMA.scala +++ b/src/main/scala/gemmini/DMA.scala @@ -58,7 +58,8 @@ class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T val core = LazyModule(new StreamReaderCore(config, nXacts, beatBits, maxBytes, spadWidth, accWidth, aligned_to, spad_rows, acc_rows, meshRows, use_tlb_register_filter, use_firesim_simulation_counters)) val node = core.node - lazy val module = new LazyModuleImp(this) { + lazy val module = new Impl + class Impl extends LazyModuleImp(this) { val io = IO(new Bundle { val req = Flipped(Decoupled(new StreamReadRequest(spad_rows, acc_rows, config.mvin_scale_t_bits))) @@ -134,7 +135,8 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf // TODO when we request data from multiple rows which are actually contiguous in main memory, we should merge them into fewer requests - lazy val module = new LazyModuleImp(this) with HasCoreParameters with MemoryOpConstants { + lazy val module = new Impl + class Impl extends LazyModuleImp(this) with HasCoreParameters with MemoryOpConstants { val (tl, edge) = node.out(0) val spadWidthBytes = spadWidth / 8 @@ -353,7 +355,8 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: require(isPow2(aligned_to)) - lazy val module = new LazyModuleImp(this) with HasCoreParameters with MemoryOpConstants { + lazy val module = new Impl + class Impl extends LazyModuleImp(this) with HasCoreParameters with MemoryOpConstants { val (tl, edge) = node.out(0) val dataBytes = dataWidth / 8 val beatBytes = beatBits / 8 diff --git a/src/main/scala/gemmini/Scratchpad.scala 
b/src/main/scala/gemmini/Scratchpad.scala index 70c9140f..0d80efb9 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -201,8 +201,8 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, xbar_node := TLBuffer() := writer.node id_node := TLWidthWidget(config.dma_buswidth/8) := TLBuffer() := xbar_node - lazy val module = new LazyModuleImp(this) with HasCoreParameters { - + lazy val module = new Impl + class Impl extends LazyModuleImp(this) with HasCoreParameters { val io = IO(new Bundle { // DMA ports val dma = new Bundle { From 49494fcfce24798cd6da9afc7918135286e158d3 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Fri, 20 Jan 2023 21:41:02 -0800 Subject: [PATCH 20/64] remove repeated lines caused by merging branches --- src/main/scala/gemmini/LoopMatmul.scala | 51 ------------------------- 1 file changed, 51 deletions(-) diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala index 92ec8fd5..a33155e9 100644 --- a/src/main/scala/gemmini/LoopMatmul.scala +++ b/src/main/scala/gemmini/LoopMatmul.scala @@ -601,57 +601,6 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In ln_mvout_cmd_rs2.local_addr.norm_cmd := ln_norm_cmd ln_mvout_cmd.rs2 := ln_mvout_cmd_rs2.asUInt - // Layernorm iterators and calculations - val ln_row = Reg(UInt(iterator_bitwidth.W)) - val ln_cmd = Reg(UInt(iterator_bitwidth.W)) - val ln_stat_id = Reg(UInt(iterator_bitwidth.W)) - - val NORM_STAT_IDS = 4 // TODO magic number - - val ln_norm_cmds = VecInit(VecInit(NormCmd.SUM, NormCmd.MEAN), VecInit(NormCmd.VARIANCE, NormCmd.INV_STDDEV), - VecInit(NormCmd.RESET, NormCmd.RESET)) - - val sm_norm_cmds = VecInit(VecInit(NormCmd.MAX, NormCmd.MAX), VecInit(NormCmd.SUM_EXP, NormCmd.INV_SUM_EXP), - VecInit(NormCmd.RESET, NormCmd.RESET)) - - val ln_stat_ids = Mux(rows -& ln_row > NORM_STAT_IDS.U, NORM_STAT_IDS.U, rows -& ln_row) - - val ln_r = ln_row +& ln_stat_id - 
- val ln_sp_addr = acc_addr_start +& (i * req.max_j +& j) * block_size.U +& ln_r - val ln_norm_cmd = Mux(j +& max_blocks >= req.max_j, - Mux(req.act === Activation.LAYERNORM, ln_norm_cmds(ln_cmd)(1), sm_norm_cmds(ln_cmd)(1)), - Mux(req.act === Activation.LAYERNORM, ln_norm_cmds(ln_cmd)(0), sm_norm_cmds(ln_cmd)(0))) - - // TODO we assume for now that full_C and layernorm aren't true at the same - val ln_dram_offset = ((i * req.dram_stride +& j) * block_size.U +& ln_r * req.dram_stride) * (input_w/8).U - val ln_dram_addr = req.dram_addr + LoopMatmul.castDramOffset(ln_dram_offset) - - val ln_config_norm_rs1 = Wire(new GemminiISA.ConfigNormRs1) - ln_config_norm_rs1 := DontCare - ln_config_norm_rs1.set_stats_id_only := 1.U - ln_config_norm_rs1.cmd_type := CONFIG_NORM - ln_config_norm_rs1.norm_stats_id := ln_stat_id - - val ln_config_norm = Wire(new RoCCCommand) - ln_config_norm := DontCare - ln_config_norm.inst.funct := CONFIG_CMD - ln_config_norm.rs1 := ln_config_norm_rs1.asUInt() - ln_config_norm.rs2 := DontCare - - val ln_mvout_cmd = Wire(new RoCCCommand) - ln_mvout_cmd := DontCare - ln_mvout_cmd.inst.funct := STORE_CMD - ln_mvout_cmd.rs1 := ln_dram_addr - - val ln_mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType) - ln_mvout_cmd_rs2 := DontCare - ln_mvout_cmd_rs2.num_rows := 1.U - ln_mvout_cmd_rs2.num_cols := cols.asUInt() - ln_mvout_cmd_rs2.local_addr := cast_to_acc_addr(ln_mvout_cmd_rs2.local_addr, ln_sp_addr, accumulate = false.B, read_full = req.full_c) - ln_mvout_cmd_rs2.local_addr.norm_cmd := ln_norm_cmd - ln_mvout_cmd.rs2 := ln_mvout_cmd_rs2.asUInt() - io.req.ready := state === idle io.j := j io.i := i From ff8bda845796ee663983b832410df6f744bffaa0 Mon Sep 17 00:00:00 2001 From: SingularityKChen Date: Sat, 21 Jan 2023 14:33:59 +0800 Subject: [PATCH 21/64] fix: Remove parenthesized forms of asUInt() (#273) --- src/main/scala/gemmini/AccumulatorScale.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/gemmini/AccumulatorScale.scala 
b/src/main/scala/gemmini/AccumulatorScale.scala index bf98a144..e4aaa192 100644 --- a/src/main/scala/gemmini/AccumulatorScale.scala +++ b/src/main/scala/gemmini/AccumulatorScale.scala @@ -160,7 +160,7 @@ class AccumulatorScale[T <: Data, U <: Data]( regs(i).valid := false.B } } - head_oh := (head_oh << 1).asUInt() | head_oh(nEntries-1) + head_oh := (head_oh << 1).asUInt | head_oh(nEntries-1) } io.in.ready := !Mux1H(tail_oh.asBools, regs.map(_.valid)) || (tail_oh === head_oh && out.fire) From 0a4517d789cf80224ff228fd314e531b8b6b3349 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Tue, 31 Jan 2023 15:57:32 -0800 Subject: [PATCH 22/64] bump chipyard to a version which compiles with 2.13 --- CHIPYARD.hash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHIPYARD.hash b/CHIPYARD.hash index 7fb91902..654fff8f 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -004297b6a8c01be1b2110c4cf4f9393ae1ff8805 +c8867beceece64dd682cc603e0ccfa0eb5b1d2a6 From 8bf01ea043b47305014143cb1a8ce449044baa64 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Tue, 31 Jan 2023 16:44:40 -0800 Subject: [PATCH 23/64] Add -f flag to chipyard CI --- .github/scripts/install-gemmini.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index 0fa6460d..eaaeb494 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -23,7 +23,7 @@ cd $LOCAL_CHIPYARD_DIR git fetch git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) -./build-setup.sh esp-tools +./build-setup.sh esp-tools -f source env.sh From 50429a57f3919934606c158a04aa282e82a8cdca Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Tue, 31 Jan 2023 17:15:51 -0800 Subject: [PATCH 24/64] Skip unncessary parts of chipyard install --- .github/scripts/install-gemmini.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index 
eaaeb494..dba6d32d 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -23,7 +23,7 @@ cd $LOCAL_CHIPYARD_DIR git fetch git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) -./build-setup.sh esp-tools -f +./build-setup.sh esp-tools -f -s 6 -s 7 -s 8 -s 9 source env.sh From 0c1be72547e87a0bd1e950113d5bd58812f8520b Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Wed, 1 Feb 2023 00:28:38 -0800 Subject: [PATCH 25/64] Bump gemmini rocc tests --- software/gemmini-rocc-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index ae0cd823..211e95f9 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit ae0cd8236d32fccf7197a7ac0634df5513cec4db +Subproject commit 211e95f961d174788c028cc42a75379585b16c63 From b8848757e0ae67db2629c8ec4bb025cbb0a75319 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Wed, 1 Feb 2023 09:38:40 -0800 Subject: [PATCH 26/64] Only install esp-isa-sim for spike-tests --- .github/scripts/install-gemmini.sh | 4 ---- .github/scripts/run-tests-spike.sh | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index dba6d32d..ec3c4f75 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -27,10 +27,6 @@ git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) source env.sh -cd toolchains/esp-tools/riscv-isa-sim/build -git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash) -make && make install - cd $LOCAL_CHECKOUT_DIR chown -R $(whoami) . 
git config --global --add safe.directory $LOCAL_CHECKOUT_DIR diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh index 93288a75..cefe5a8e 100755 --- a/.github/scripts/run-tests-spike.sh +++ b/.github/scripts/run-tests-spike.sh @@ -10,6 +10,10 @@ source $SCRIPT_DIR/enable-conda.sh cd $LOCAL_CHIPYARD_DIR source env.sh +cd $LOCAL_CHIPYARD_DIR/toolchains/esp-tools/riscv-isa-sim/build +git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash) +make && make install + cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests ./build.sh From 0529232d879c27f4bc7942a1ccdb0c71dd1999f2 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Wed, 1 Feb 2023 09:51:48 -0800 Subject: [PATCH 27/64] Bump gemmini-rocc-tests --- .github/scripts/install-gemmini.sh | 2 +- software/gemmini-rocc-tests | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index ec3c4f75..8a89a106 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -23,7 +23,7 @@ cd $LOCAL_CHIPYARD_DIR git fetch git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) -./build-setup.sh esp-tools -f -s 6 -s 7 -s 8 -s 9 +./build-setup.sh riscv-tools -f -s 6 -s 7 -s 8 -s 9 source env.sh diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index 211e95f9..61dbeb6a 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit 211e95f961d174788c028cc42a75379585b16c63 +Subproject commit 61dbeb6a92cf8386e5171da2d97bfe1ad5587151 From 534f8fc2569bb6e0aeb841abb76ae0d97e1eb3bf Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Wed, 1 Feb 2023 13:40:11 -0800 Subject: [PATCH 28/64] Fix spike build in CI --- .github/scripts/run-tests-spike.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh index cefe5a8e..c21274f2 100755 --- 
a/.github/scripts/run-tests-spike.sh +++ b/.github/scripts/run-tests-spike.sh @@ -10,8 +10,12 @@ source $SCRIPT_DIR/enable-conda.sh cd $LOCAL_CHIPYARD_DIR source env.sh -cd $LOCAL_CHIPYARD_DIR/toolchains/esp-tools/riscv-isa-sim/build +git clone git@github.com:ucb-bar/esp-isa-sim.git +cd esp-isa-sim git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash) +mkdir build +cd build +../configure --prefix=$RISCV make && make install cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests From 17a03981dd71155a5d5d22d5d62b35178b2b840c Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Thu, 2 Feb 2023 14:44:07 -0800 Subject: [PATCH 29/64] Switch to https spike clone in CI --- .github/scripts/run-tests-spike.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh index c21274f2..271b63f3 100755 --- a/.github/scripts/run-tests-spike.sh +++ b/.github/scripts/run-tests-spike.sh @@ -10,7 +10,7 @@ source $SCRIPT_DIR/enable-conda.sh cd $LOCAL_CHIPYARD_DIR source env.sh -git clone git@github.com:ucb-bar/esp-isa-sim.git +git clone https://github.com/ucb-bar/esp-isa-sim.git cd esp-isa-sim git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash) mkdir build From aece1a6078e8f93cbbcc7f55a9b9f88e47df1c0b Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Fri, 3 Feb 2023 16:35:11 -0800 Subject: [PATCH 30/64] Tie off debug in CI tests --- .github/scripts/do-rtl-build.sh | 2 +- .github/scripts/run-tests-rtl.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/do-rtl-build.sh b/.github/scripts/do-rtl-build.sh index 38651571..36cbbcbf 100755 --- a/.github/scripts/do-rtl-build.sh +++ b/.github/scripts/do-rtl-build.sh @@ -14,5 +14,5 @@ source env.sh cd $LOCAL_SIM_DIR make -C $LOCAL_SIM_DIR clean -make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false 
-Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=GemminiRocketConfig +make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=chipyard.harness.WithTiedOffDebug_GemminiRocketConfig diff --git a/.github/scripts/run-tests-rtl.sh b/.github/scripts/run-tests-rtl.sh index 47a87ff1..14bf137b 100755 --- a/.github/scripts/run-tests-rtl.sh +++ b/.github/scripts/run-tests-rtl.sh @@ -14,5 +14,5 @@ cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests CFLAGS=-DFAST ./build.sh cd build -make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_CHIPYARD_DIR/sims/verilator/ CONFIG=GemminiRocketConfig run-binary-hex BINARY='" +make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_CHIPYARD_DIR/sims/verilator/ CONFIG=chipyard.harness.WithTiedOffDebug_GemminiRocketConfig run-binary-hex BINARY='" From 3887f5181c91369aa2e1a3aeddbb12858bf2ccc6 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sat, 4 Feb 2023 11:21:11 -0800 Subject: [PATCH 31/64] Remove debug module entirely from CI --- .github/scripts/defaults.sh | 2 ++ .github/scripts/do-rtl-build.sh | 2 +- .github/scripts/run-tests-rtl.sh | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/scripts/defaults.sh b/.github/scripts/defaults.sh index e403fc89..b49ae66e 100755 --- a/.github/scripts/defaults.sh +++ b/.github/scripts/defaults.sh @@ -30,6 +30,8 @@ LOCAL_SIM_DIR=$LOCAL_CHIPYARD_DIR/sims/verilator LOCAL_VERILATOR_DIR=$HOME/verilator-install LOCAL_CONDA=/opt/conda/ +CICONFIG=chipyard.config.WithNoDebug_GemminiRocketConfig + echo "::set-output name=LOCAL_WORK_DIR::$LOCAL_WORK_DIR" echo "::set-output name=LOCAL_CHECKOUT_DIR::$LOCAL_CHECKOUT_DIR" echo "::set-output name=LOCAL_RISCV_DIR::$LOCAL_RISCV_DIR" diff --git 
a/.github/scripts/do-rtl-build.sh b/.github/scripts/do-rtl-build.sh index 36cbbcbf..3e26a04f 100755 --- a/.github/scripts/do-rtl-build.sh +++ b/.github/scripts/do-rtl-build.sh @@ -14,5 +14,5 @@ source env.sh cd $LOCAL_SIM_DIR make -C $LOCAL_SIM_DIR clean -make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=chipyard.harness.WithTiedOffDebug_GemminiRocketConfig +make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=$CICONFIG diff --git a/.github/scripts/run-tests-rtl.sh b/.github/scripts/run-tests-rtl.sh index 14bf137b..e179bd37 100755 --- a/.github/scripts/run-tests-rtl.sh +++ b/.github/scripts/run-tests-rtl.sh @@ -14,5 +14,5 @@ cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests CFLAGS=-DFAST ./build.sh cd build -make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_CHIPYARD_DIR/sims/verilator/ CONFIG=chipyard.harness.WithTiedOffDebug_GemminiRocketConfig run-binary-hex BINARY='" +make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_CHIPYARD_DIR/sims/verilator/ CONFIG=$CICONFIG run-binary-hex BINARY='" From ab5d181b91c21ccf6727150070fe6a0cbcbefa6e Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 5 Feb 2023 09:19:16 -0800 Subject: [PATCH 32/64] Add libgemmini submodule --- .gitmodules | 3 +++ software/libgemmini | 1 + 2 files changed, 4 insertions(+) create mode 160000 software/libgemmini diff --git a/.gitmodules b/.gitmodules index 19df446b..fbd4220e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "software/onnxruntime-riscv"] path = software/onnxruntime-riscv url = 
https://github.com/pranav-prakash/onnxruntime-riscv.git +[submodule "software/libgemmini"] + path = software/libgemmini + url = https://github.com/ucb-bar/libgemmini.git diff --git a/software/libgemmini b/software/libgemmini new file mode 160000 index 00000000..144b975a --- /dev/null +++ b/software/libgemmini @@ -0,0 +1 @@ +Subproject commit 144b975a14646b22d23696aadd0b543423182a21 From c1a8807155e19bf3b062e3893d379c0d9b6131f9 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 5 Feb 2023 09:20:39 -0800 Subject: [PATCH 33/64] Bump chipyard --- CHIPYARD.hash | 2 +- SPIKE.hash | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 SPIKE.hash diff --git a/CHIPYARD.hash b/CHIPYARD.hash index 654fff8f..0e76f9a8 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -c8867beceece64dd682cc603e0ccfa0eb5b1d2a6 +30019f3fd739c49206afafea84b316312832efd8 diff --git a/SPIKE.hash b/SPIKE.hash deleted file mode 100644 index 8cbb8d37..00000000 --- a/SPIKE.hash +++ /dev/null @@ -1 +0,0 @@ -051d820f08be84d069993de4375d29c91eb2f577 From b6389f3ea7bbf070aa3dd40972daa5be7e2d4261 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 5 Feb 2023 09:24:18 -0800 Subject: [PATCH 34/64] Remove esp-isa-sim install from CI --- .github/scripts/run-tests-spike.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh index 271b63f3..93288a75 100755 --- a/.github/scripts/run-tests-spike.sh +++ b/.github/scripts/run-tests-spike.sh @@ -10,14 +10,6 @@ source $SCRIPT_DIR/enable-conda.sh cd $LOCAL_CHIPYARD_DIR source env.sh -git clone https://github.com/ucb-bar/esp-isa-sim.git -cd esp-isa-sim -git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash) -mkdir build -cd build -../configure --prefix=$RISCV -make && make install - cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests ./build.sh From 8b2661100e047402edf0b870bd969acf6e6348a4 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 5 Feb 2023 
10:01:02 -0800 Subject: [PATCH 35/64] Bump chipyard --- CHIPYARD.hash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHIPYARD.hash b/CHIPYARD.hash index 0e76f9a8..84851613 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -30019f3fd739c49206afafea84b316312832efd8 +e5a734e20a2f4f54db14a190bc0de5efe51f15b2 From b8dcc3ae0b576ec2a17b33254ac181b6bc2f8345 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 5 Feb 2023 10:01:14 -0800 Subject: [PATCH 36/64] Install our own libgemmini --- .github/scripts/install-gemmini.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index 8a89a106..7d4c4ad3 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -27,6 +27,8 @@ git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) source env.sh +make -C $LOCAL_CHECKOUT_DIR/software/libgemmini + cd $LOCAL_CHECKOUT_DIR chown -R $(whoami) . git config --global --add safe.directory $LOCAL_CHECKOUT_DIR From 24079deb6b30eea8941ba511da00d70a2d2fe540 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 5 Feb 2023 10:36:42 -0800 Subject: [PATCH 37/64] Bump chipyard --- CHIPYARD.hash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHIPYARD.hash b/CHIPYARD.hash index 84851613..747b6b18 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -e5a734e20a2f4f54db14a190bc0de5efe51f15b2 +bcbe3b7f1f40d1c388aca68df498fd7dd4d16e89 From 75c534aa2586aa9d2bb62edaf2e1fcf9bd85135b Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 5 Feb 2023 11:06:59 -0800 Subject: [PATCH 38/64] Fix install-gemmini script --- .github/scripts/install-gemmini.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index 7d4c4ad3..dac96eea 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -27,7 +27,9 @@ git checkout $(cat 
$LOCAL_CHECKOUT_DIR/CHIPYARD.hash) source env.sh -make -C $LOCAL_CHECKOUT_DIR/software/libgemmini +cd $LOCAL_CHECKOUT_DIR +git submodule update --init software/libgemmini +make -C software/libgemmini cd $LOCAL_CHECKOUT_DIR chown -R $(whoami) . From 89a5c8f84c8c2fcd233627ba5c2653222562d620 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 5 Feb 2023 11:46:43 -0800 Subject: [PATCH 39/64] Fix git dubious ownership warning --- .github/scripts/install-gemmini.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index dac96eea..37260e05 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -28,6 +28,7 @@ git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) source env.sh cd $LOCAL_CHECKOUT_DIR +git config --global --add safe.directory $LOCAL_CHECKOUT_DIR git submodule update --init software/libgemmini make -C software/libgemmini From fbe4ec19911e722bea4077f3f3c761ef405820ab Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sun, 5 Feb 2023 13:54:42 -0800 Subject: [PATCH 40/64] try to fix safe-directory errors --- .github/scripts/install-gemmini.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index 37260e05..b98e6250 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -28,13 +28,15 @@ git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) source env.sh cd $LOCAL_CHECKOUT_DIR +chown -R $(whoami) . git config --global --add safe.directory $LOCAL_CHECKOUT_DIR +git config --global --add safe.directory '*' + +cd $LOCAL_CHECKOUT_DIR git submodule update --init software/libgemmini make -C software/libgemmini cd $LOCAL_CHECKOUT_DIR -chown -R $(whoami) . 
-git config --global --add safe.directory $LOCAL_CHECKOUT_DIR git submodule update --init --recursive software/gemmini-rocc-tests rm -rf $LOCAL_CHIPYARD_DIR/generators/gemmini/* $LOCAL_CHIPYARD_DIR/generators/gemmini/.git* mv -f $LOCAL_CHECKOUT_DIR/* $LOCAL_CHECKOUT_DIR/.git* $LOCAL_CHIPYARD_DIR/generators/gemmini/ From 43a2ebcaf08e555171185f70a0ac8511eebdcba3 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 5 Feb 2023 20:32:06 -0800 Subject: [PATCH 41/64] Bump libgemmini --- software/libgemmini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/libgemmini b/software/libgemmini index 144b975a..4be22079 160000 --- a/software/libgemmini +++ b/software/libgemmini @@ -1 +1 @@ -Subproject commit 144b975a14646b22d23696aadd0b543423182a21 +Subproject commit 4be220794cfdb834e8ecc2ee7becdf8632cc268c From 8c33285c5a15aa60929e300b7ec766f4ef799c4e Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 5 Feb 2023 22:41:22 -0800 Subject: [PATCH 42/64] Delete stale libgemmini before installing in CI --- .github/scripts/install-gemmini.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index b98e6250..1acab7b9 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -33,6 +33,8 @@ git config --global --add safe.directory $LOCAL_CHECKOUT_DIR git config --global --add safe.directory '*' cd $LOCAL_CHECKOUT_DIR +# Delete the stale libgemmini first installed by chipyard, switch to the one submoduled here +rm -rf $RISCV/lib/libgemmini.so git submodule update --init software/libgemmini make -C software/libgemmini From 6dd1ae9ddd084acc2506f14f0510e3244f25b30d Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 5 Feb 2023 23:37:15 -0800 Subject: [PATCH 43/64] install libgemmini in run-tests-spike for ci --- .github/scripts/install-gemmini.sh | 6 ------ .github/scripts/run-tests-spike.sh | 11 +++++++++++ 2 files changed, 11 insertions(+), 6 deletions(-) 
diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index 1acab7b9..daf2d891 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -32,12 +32,6 @@ chown -R $(whoami) . git config --global --add safe.directory $LOCAL_CHECKOUT_DIR git config --global --add safe.directory '*' -cd $LOCAL_CHECKOUT_DIR -# Delete the stale libgemmini first installed by chipyard, switch to the one submoduled here -rm -rf $RISCV/lib/libgemmini.so -git submodule update --init software/libgemmini -make -C software/libgemmini - cd $LOCAL_CHECKOUT_DIR git submodule update --init --recursive software/gemmini-rocc-tests rm -rf $LOCAL_CHIPYARD_DIR/generators/gemmini/* $LOCAL_CHIPYARD_DIR/generators/gemmini/.git* diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh index 93288a75..9a2d8ede 100755 --- a/.github/scripts/run-tests-spike.sh +++ b/.github/scripts/run-tests-spike.sh @@ -10,6 +10,17 @@ source $SCRIPT_DIR/enable-conda.sh cd $LOCAL_CHIPYARD_DIR source env.sh +cd $LOCAL_CHECKOUT_DIR +chown -R $(whoami) . 
+git config --global --add safe.directory $LOCAL_CHECKOUT_DIR +git config --global --add safe.directory '*' + +cd $LOCAL_CHECKOUT_DIR +# Delete the stale libgemmini first installed by chipyard, switch to the one submoduled here +rm -rf $RISCV/lib/libgemmini.so +git submodule update --init software/libgemmini +make -C software/libgemmini + cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests ./build.sh From 1d66652bb400f61bd2a1f6cdc08ab8a73dcd32c2 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Mon, 6 Feb 2023 09:41:04 -0800 Subject: [PATCH 44/64] Actually install our libgemmini in CI --- .github/scripts/run-tests-spike.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh index 9a2d8ede..41774cfd 100755 --- a/.github/scripts/run-tests-spike.sh +++ b/.github/scripts/run-tests-spike.sh @@ -19,7 +19,7 @@ cd $LOCAL_CHECKOUT_DIR # Delete the stale libgemmini first installed by chipyard, switch to the one submoduled here rm -rf $RISCV/lib/libgemmini.so git submodule update --init software/libgemmini -make -C software/libgemmini +make -C software/libgemmini install cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests ./build.sh From 7c8657cf20aad1f76f4dc1b216dc35b9156c6768 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Tue, 7 Feb 2023 09:43:03 -0800 Subject: [PATCH 45/64] [ci skip] Update README/scripts with new isa-sim instructions --- README.md | 14 ++++---------- scripts/build-spike.sh | 6 +++--- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 5f310564..4caf35bb 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Run these steps to install Chipyard and Spike (make sure to checkout the correct git clone https://github.com/ucb-bar/chipyard.git cd chipyard git checkout 1.8.1 -./build-setup.sh esp-tools +./build-setup.sh riscv-tools source env.sh @@ -42,12 +42,7 @@ git config remote.origin.fetch 
"+refs/heads/*:refs/remotes/origin/*" git checkout dev && git pull origin dev git submodule update --init --recursive -SPIKE_HASH=$(cat SPIKE.hash) - -cd - -cd toolchains/esp-tools/riscv-isa-sim/build -git fetch && git checkout $SPIKE_HASH -make && make install +make -C software/libgemmini install # The final step is only necessary if you want to run MIDAS simulations with # realistic DRAM models @@ -368,9 +363,8 @@ Afterwards, the test binaries will be found in `software/gemmini-rocc-tests/buil Binaries whose names end in `-baremetal` are meant to be run in a bare-metal environment, while binaries whose names end in `-linux` are meant to run in a Linux environment. You can run the tests either on a cycle-accurate RTL simulator, or on a (much faster) functional ISA simulator called Spike. -We use a special fork of Spike, found [here](https://github.com/ucb-bar/esp-isa-sim), which has support for Gemmini instructions. -(You can find the required commit hash in `SPIKE.hash`). -If you are using Chipyard, you can easily build Spike by running `./scripts/build-toolchains.sh esp-tools` from Chipyard's root directory. +We use a special extension of Spike, found [here](https://github.com/ucb-bar/libgemmini), which has support for Gemmini instructions. +If you are using Chipyard, you can easily build Spike by running `./scripts/build-toolchains.sh riscv-tools` from Chipyard's root directory, then by running `make -C software/libgemmini install` in the Gemmini directory. Then, to run the `mvin_mvout` test, which simply moves a matrix into Gemmini's scratchpad before moving it back out into main memory, run the following commands: ```shell diff --git a/scripts/build-spike.sh b/scripts/build-spike.sh index f7b1baf2..1df963ab 100755 --- a/scripts/build-spike.sh +++ b/scripts/build-spike.sh @@ -24,6 +24,6 @@ echo Generating new gemmini_params.h file... 
make verilog CONFIG=CustomGemminiSoCConfig &> build.log cd - -cp software/gemmini-rocc-tests/include/gemmini_params.h ../../toolchains/esp-tools/riscv-isa-sim/gemmini/gemmini_params.h -cd ../../toolchains/esp-tools/riscv-isa-sim/build -make && make install +cp software/gemmini-rocc-tests/include/gemmini_params.h software/libgemmini/gemmini_params.h +make -C software/libgemmini clean +make -C software/libgemmini install From 074a880df20a2a3bb0b95ddb707aae8dd06ed324 Mon Sep 17 00:00:00 2001 From: Jamie Hong <27294970+curiousdragon@users.noreply.github.com> Date: Sat, 11 Feb 2023 08:55:02 -0800 Subject: [PATCH 46/64] Update gemmini-rocc-tests to support rectangular convolutions (#278) --------- Co-authored-by: Hasan Genc --- software/gemmini-rocc-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index 61dbeb6a..bc3cabb4 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit 61dbeb6a92cf8386e5171da2d97bfe1ad5587151 +Subproject commit bc3cabb4081e3dbaa1d33119f0471357f2aa3927 From 9e478ecce9e48bbc03b9bd3535d71e03a6269fba Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Wed, 22 Feb 2023 11:07:28 -0800 Subject: [PATCH 47/64] Avoid depending on testchipip --- src/main/scala/gemmini/DMA.scala | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala index 3744282c..6d87a01e 100644 --- a/src/main/scala/gemmini/DMA.scala +++ b/src/main/scala/gemmini/DMA.scala @@ -8,8 +8,7 @@ import chisel3.experimental.DataMirror import freechips.rocketchip.config.Parameters import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp} import freechips.rocketchip.tile.{CoreBundle, HasCoreParameters} -import freechips.rocketchip.tilelink.TLBundleA -import testchipip.TLHelper +import freechips.rocketchip.tilelink._ import freechips.rocketchip.rocket.MStatus import 
freechips.rocketchip.rocket.constants.MemoryOpConstants @@ -128,8 +127,8 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf use_tlb_register_filter: Boolean, use_firesim_simulation_counters: Boolean) (implicit p: Parameters) extends LazyModule { - val node = TLHelper.makeClientNode( - name = "stream-reader", sourceId = IdRange(0, nXacts)) + val node = TLClientNode(Seq(TLMasterPortParameters.v1(Seq(TLClientParameters( + name = "stream-reader", sourceId = IdRange(0, nXacts)))))) require(isPow2(aligned_to)) @@ -350,8 +349,8 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: inputType: T, block_cols: Int, use_tlb_register_filter: Boolean, use_firesim_simulation_counters: Boolean) (implicit p: Parameters) extends LazyModule { - val node = TLHelper.makeClientNode( - name = "stream-writer", sourceId = IdRange(0, nXacts)) + val node = TLClientNode(Seq(TLMasterPortParameters.v1(Seq(TLClientParameters( + name = "stream-writer", sourceId = IdRange(0, nXacts)))))) require(isPow2(aligned_to)) From 1a8eaba5129afeed39147a833e9265079c151816 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sat, 11 Mar 2023 10:32:29 -0800 Subject: [PATCH 48/64] Bump software to get gcc12 fix --- software/gemmini-rocc-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index bc3cabb4..4a2f0061 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit bc3cabb4081e3dbaa1d33119f0471357f2aa3927 +Subproject commit 4a2f00614d390baeb996919cb2bbc6d2b307b918 From a916bfb1a2bccb4a7caef289e22c5a1068cba54a Mon Sep 17 00:00:00 2001 From: abejgonzalez Date: Sat, 11 Mar 2023 22:29:21 -0800 Subject: [PATCH 49/64] Add LeanGemminiConfig for tutorial --- src/main/scala/gemmini/Configs.scala | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/main/scala/gemmini/Configs.scala 
b/src/main/scala/gemmini/Configs.scala index bd84b317..19fc1735 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -1,4 +1,3 @@ - package gemmini import chisel3._ @@ -256,6 +255,21 @@ class DefaultGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( ) }) +/** + * Mixin which sets the default lean parameters for a systolic array accelerator. + */ +class LeanGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( + gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiConfigs.leanConfig +) extends Config((site, here, up) => { + case BuildRoCC => up(BuildRoCC) ++ Seq( + (p: Parameters) => { + implicit val q = p + val gemmini = LazyModule(new Gemmini(gemminiConfig)) + gemmini + } + ) +}) + // This Gemmini config has both an Int and an FP Gemmini side-by-side, sharing // the same scratchpad. class DualGemminiConfig extends Config((site, here, up) => { From e507416c38f63cd19eac9bf8c11c5087b3ba2624 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sun, 12 Mar 2023 12:52:42 -0700 Subject: [PATCH 50/64] bump gemmini-rocc-tests --- software/gemmini-rocc-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index 4a2f0061..b5e531e9 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit 4a2f00614d390baeb996919cb2bbc6d2b307b918 +Subproject commit b5e531e98303b1ad98386cd4b82262a68e1d4ce6 From 5abbe54a0d6f8b3c86699bd559d082df2912ed41 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sun, 12 Mar 2023 19:03:29 -0700 Subject: [PATCH 51/64] bump gemmini-rocc-tests to dev --- software/gemmini-rocc-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index b5e531e9..13e7e1fc 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit b5e531e98303b1ad98386cd4b82262a68e1d4ce6 +Subproject 
commit 13e7e1fce1a8d332eea563c14130136ef0533b16 From 686cb15dad756887db62460968bc616392bc4341 Mon Sep 17 00:00:00 2001 From: Sagar Karandikar Date: Thu, 23 Mar 2023 01:59:03 +0000 Subject: [PATCH 52/64] add printf config for tutorials --- src/main/scala/gemmini/Configs.scala | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala index 19fc1735..8db9a0a7 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -236,6 +236,9 @@ object GemminiConfigs { ) val leanConfig = defaultConfig.copy(dataflow=Dataflow.WS, max_in_flight_mem_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true) + + val leanPrintfConfig = defaultConfig.copy(dataflow=Dataflow.WS, max_in_flight_mem_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true, use_firesim_simulation_counters=true) + } /** @@ -270,6 +273,18 @@ class LeanGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( ) }) +class LeanGemminiPrintfConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( + gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiConfigs.leanPrintfConfig +) extends Config((site, here, up) => { + case BuildRoCC => up(BuildRoCC) ++ Seq( + (p: Parameters) => { + implicit val q = p + val gemmini = LazyModule(new Gemmini(gemminiConfig)) + gemmini + } + ) +}) + // This Gemmini config has both an Int and an FP Gemmini side-by-side, sharing // the same scratchpad. 
class DualGemminiConfig extends Config((site, here, up) => { From a9a92eabf77fff200dd3bebf84e83bd22858f178 Mon Sep 17 00:00:00 2001 From: Seah <54855793+SeahK@users.noreply.github.com> Date: Mon, 27 Mar 2023 16:46:29 -0700 Subject: [PATCH 53/64] add dummy default config --- src/main/scala/gemmini/Configs.scala | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala index 19fc1735..9e7c4b99 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -172,15 +172,15 @@ object GemminiConfigs { meshRows = defaultConfig.meshRows, meshColumns = defaultConfig.meshColumns, dataflow = defaultConfig.dataflow, - sp_capacity = defaultConfig.sp_capacity, - acc_capacity = defaultConfig.acc_capacity, + sp_capacity = CapacityInKilobytes(128), + acc_capacity = CapacityInKilobytes(128), sp_banks = defaultConfig.sp_banks, acc_banks = defaultConfig.acc_banks, sp_singleported = defaultConfig.sp_singleported, acc_singleported = defaultConfig.acc_singleported, - has_training_convs = defaultConfig.has_training_convs, + has_training_convs = false, has_max_pool = defaultConfig.has_max_pool, - has_nonlinear_activations = defaultConfig.has_nonlinear_activations, + has_nonlinear_activations = false, reservation_station_entries_ld = defaultConfig.reservation_station_entries_ld, reservation_station_entries_st = defaultConfig.reservation_station_entries_st, reservation_station_entries_ex = defaultConfig.reservation_station_entries_ex, @@ -209,14 +209,14 @@ object GemminiConfigs { c_str = "({float y = ROUND_NEAR_EVEN((x) * (scale)); y > INT8_MAX ? INT8_MAX : (y < INT8_MIN ? 
INT8_MIN : (acc_t)y);})" )), - num_counter = defaultConfig.num_counter, + num_counter = 0, - acc_read_full_width = defaultConfig.acc_read_full_width, + acc_read_full_width = false, acc_read_small_width = defaultConfig.acc_read_small_width, ex_read_from_spad = defaultConfig.ex_read_from_spad, - ex_read_from_acc = defaultConfig.ex_read_from_acc, - ex_write_to_spad = defaultConfig.ex_write_to_spad, + ex_read_from_acc = false, + ex_write_to_spad = false, ex_write_to_acc = defaultConfig.ex_write_to_acc, ) @@ -270,6 +270,18 @@ class LeanGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( ) }) +class DummyDefaultGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( + gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiConfigs.dummyConfig +) extends Config((site, here, up) => { + case BuildRoCC => up(BuildRoCC) ++ Seq( + (p: Parameters) => { + implicit val q = p + val gemmini = LazyModule(new Gemmini(gemminiConfig)) + gemmini + } + ) +}) + // This Gemmini config has both an Int and an FP Gemmini side-by-side, sharing // the same scratchpad. 
class DualGemminiConfig extends Config((site, here, up) => { From 8f5f6d556d1e67daecd6e5b75ff5c327e767d77e Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Wed, 29 Mar 2023 11:47:51 -0700 Subject: [PATCH 54/64] Fix typo --- src/main/scala/gemmini/Configs.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala index 960dac4d..b0d73764 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -273,7 +273,7 @@ class LeanGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( ) }) -class LeanGemminiPrintfConfig[T <: Data : Arithmetic, U <: Data, V <: Data] +class LeanGemminiPrintfConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiConfigs.leanPrintfConfig ) extends Config((site, here, up) => { case BuildRoCC => up(BuildRoCC) ++ Seq( From 4dd19f3e93d2decc24048eb1776e53fb20855841 Mon Sep 17 00:00:00 2001 From: abejgonzalez Date: Thu, 30 Mar 2023 23:19:56 -0700 Subject: [PATCH 55/64] Add smoke test | Support Gemmini spike --- software/gemmini-ort.json | 4 ++-- software/gemmini-smoke.json | 10 ++++++++++ software/gemmini-tests-full.json | 3 ++- software/gemmini-tests-interactive.json | 3 ++- software/gemmini-tests.json | 3 ++- software/overlay/root/run-test-smoke.sh | 9 +++++++++ 6 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 software/gemmini-smoke.json create mode 100755 software/overlay/root/run-test-smoke.sh diff --git a/software/gemmini-ort.json b/software/gemmini-ort.json index c4a95253..a04c2869 100644 --- a/software/gemmini-ort.json +++ b/software/gemmini-ort.json @@ -53,6 +53,6 @@ ], "overlay": "../onnxruntime-riscv/systolic_runner/imagenet_runner", "rootfs-size": "16GiB", - "run": "run-ort.sh" + "run": "run-ort.sh", + "spike-args": "--extension=gemmini" } - diff --git a/software/gemmini-smoke.json b/software/gemmini-smoke.json new file mode 100644 index 
00000000..cdfb4d13 --- /dev/null +++ b/software/gemmini-smoke.json @@ -0,0 +1,10 @@ +{ + "name" : "gemmini-smoke", + "workdir" : ".", + "base" : "br-base.json", + "overlay" : "overlay", + "host-init" : "host-init.sh", + "command": "/root/run-test-smoke.sh", + "rootfs-size" : "16GiB", + "spike-args" : "--extension=gemmini" +} diff --git a/software/gemmini-tests-full.json b/software/gemmini-tests-full.json index c4c56d29..6acfced1 100644 --- a/software/gemmini-tests-full.json +++ b/software/gemmini-tests-full.json @@ -4,5 +4,6 @@ "base" : "br-base.json", "overlay" : "overlay", "host-init" : "host-init.sh", - "command": "/root/run-tests-full.sh" + "command": "/root/run-tests-full.sh", + "spike-args": "--extension=gemmini" } diff --git a/software/gemmini-tests-interactive.json b/software/gemmini-tests-interactive.json index 0fe52409..72eca491 100644 --- a/software/gemmini-tests-interactive.json +++ b/software/gemmini-tests-interactive.json @@ -4,5 +4,6 @@ "base" : "br-base.json", "overlay" : "overlay", "host-init" : "host-init.sh", - "rootfs-size" : "16GiB" + "rootfs-size" : "16GiB", + "spike-args": "--extension=gemmini" } diff --git a/software/gemmini-tests.json b/software/gemmini-tests.json index fc0e45a9..754e35ed 100644 --- a/software/gemmini-tests.json +++ b/software/gemmini-tests.json @@ -5,5 +5,6 @@ "overlay" : "overlay", "host-init" : "host-init.sh", "command": "/root/run-tests.sh", - "rootfs-size" : "16GiB" + "rootfs-size" : "16GiB", + "spike-args" : "--extension=gemmini" } diff --git a/software/overlay/root/run-test-smoke.sh b/software/overlay/root/run-test-smoke.sh new file mode 100755 index 00000000..7ba04632 --- /dev/null +++ b/software/overlay/root/run-test-smoke.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +echo "*****************TEST RESULTS*************" > test_output.txt + +echo "========mobilenet=========" +/root/imagenet/mobilenet-linux >> test_output.txt + +cat test_output.txt +poweroff -f From e1f7eaa9292a02bb8e6699fc545c0400f065ebee Mon Sep 17 
00:00:00 2001 From: Jerry Zhao Date: Mon, 20 Mar 2023 19:14:32 -0700 Subject: [PATCH 56/64] Bump to latest rocket-chip/chisel3.5.6 --- build.sbt | 4 ++-- src/main/scala/gemmini/CmdFSM.scala | 2 +- src/main/scala/gemmini/Configs.scala | 2 +- src/main/scala/gemmini/ConfigsFP.scala | 2 +- src/main/scala/gemmini/Controller.scala | 2 +- src/main/scala/gemmini/CounterFile.scala | 2 +- src/main/scala/gemmini/CustomConfigs.scala | 2 +- src/main/scala/gemmini/CustomSoCConfigs.scala | 2 +- src/main/scala/gemmini/DMA.scala | 2 +- src/main/scala/gemmini/DSEConfigs.scala | 2 +- src/main/scala/gemmini/ExecuteController.scala | 2 +- src/main/scala/gemmini/FrontendTLB.scala | 2 +- src/main/scala/gemmini/InstructionCompression.scala | 2 +- src/main/scala/gemmini/LoadController.scala | 2 +- src/main/scala/gemmini/LoopConv.scala | 2 +- src/main/scala/gemmini/LoopMatmul.scala | 2 +- src/main/scala/gemmini/LoopUnroller.scala | 2 +- src/main/scala/gemmini/Scratchpad.scala | 2 +- src/main/scala/gemmini/StoreController.scala | 2 +- src/main/scala/gemmini/TilerController.scala | 2 +- src/main/scala/gemmini/TilerFSM.scala | 2 +- src/main/scala/gemmini/TilerScheduler.scala | 2 +- src/main/scala/gemmini/TransposePreloadUnroller.scala | 2 +- 23 files changed, 24 insertions(+), 24 deletions(-) diff --git a/build.sbt b/build.sbt index de8bd4e5..80242277 100644 --- a/build.sbt +++ b/build.sbt @@ -7,9 +7,9 @@ version := "3.1.0" scalaVersion := "2.13.10" libraryDependencies ++= Seq( - "edu.berkeley.cs" %% "chisel3" % "3.4.+", + "edu.berkeley.cs" %% "chisel3" % "3.5.6", "edu.berkeley.cs" %% "rocketchip" % "1.2.+", - "edu.berkeley.cs" %% "chisel-iotesters" % "1.5.+", + "edu.berkeley.cs" %% "chisel-iotesters" % "2.5.6", "org.scalanlp" %% "breeze" % "1.1") resolvers ++= Seq( diff --git a/src/main/scala/gemmini/CmdFSM.scala b/src/main/scala/gemmini/CmdFSM.scala index 8ee3a696..e5fdb832 100644 --- a/src/main/scala/gemmini/CmdFSM.scala +++ b/src/main/scala/gemmini/CmdFSM.scala @@ -9,7 +9,7 @@ package 
gemmini import chisel3._ import chisel3.util._ import chisel3.experimental._ -import freechips.rocketchip.config._ +import org.chipsalliance.cde.config._ import freechips.rocketchip.rocket._ import freechips.rocketchip.tile._ import GemminiISA._ diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala index b0d73764..a849883d 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -1,7 +1,7 @@ package gemmini import chisel3._ -import freechips.rocketchip.config.{Config, Parameters} +import org.chipsalliance.cde.config.{Config, Parameters} import freechips.rocketchip.diplomacy.LazyModule import freechips.rocketchip.subsystem._ import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet, XLen} diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala index c76907dd..ceb9d2a0 100644 --- a/src/main/scala/gemmini/ConfigsFP.scala +++ b/src/main/scala/gemmini/ConfigsFP.scala @@ -1,7 +1,7 @@ package gemmini import chisel3._ -import freechips.rocketchip.config.{Config, Parameters} +import org.chipsalliance.cde.config.{Config, Parameters} import freechips.rocketchip.diplomacy.{LazyModule, ValName} import freechips.rocketchip.subsystem._ import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet} diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index d4c5f7d5..0fdda55f 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -6,7 +6,7 @@ import java.nio.file.{Files, Paths} import chisel3._ import chisel3.util._ -import freechips.rocketchip.config._ +import org.chipsalliance.cde.config._ import freechips.rocketchip.diplomacy._ import freechips.rocketchip.tile._ import freechips.rocketchip.util.ClockGate diff --git a/src/main/scala/gemmini/CounterFile.scala b/src/main/scala/gemmini/CounterFile.scala index 7b28b8e2..055ab1f3 100644 --- a/src/main/scala/gemmini/CounterFile.scala +++ 
b/src/main/scala/gemmini/CounterFile.scala @@ -2,7 +2,7 @@ package gemmini import chisel3._ import chisel3.util._ -import freechips.rocketchip.config._ +import org.chipsalliance.cde.config._ import freechips.rocketchip.diplomacy._ import freechips.rocketchip.tile._ import freechips.rocketchip.tilelink.{TLIdentityNode} diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala index ae529a69..011d7ce1 100644 --- a/src/main/scala/gemmini/CustomConfigs.scala +++ b/src/main/scala/gemmini/CustomConfigs.scala @@ -1,6 +1,6 @@ package gemmini -import chipsalliance.rocketchip.config.{Config, Parameters} +import org.chipsalliance.cde.config.{Config, Parameters} import chisel3._ import freechips.rocketchip.diplomacy.LazyModule import freechips.rocketchip.subsystem.SystemBusKey diff --git a/src/main/scala/gemmini/CustomSoCConfigs.scala b/src/main/scala/gemmini/CustomSoCConfigs.scala index 057aa1e1..58636db1 100644 --- a/src/main/scala/gemmini/CustomSoCConfigs.scala +++ b/src/main/scala/gemmini/CustomSoCConfigs.scala @@ -1,7 +1,7 @@ /* package chipyard -import freechips.rocketchip.config.{Config} +import org.chipsalliance.cde.config.{Config} class CustomGemminiSoCConfig extends Config( new gemmini.GemminiCustomConfig ++ diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala index 6d87a01e..1fd0be82 100644 --- a/src/main/scala/gemmini/DMA.scala +++ b/src/main/scala/gemmini/DMA.scala @@ -5,7 +5,7 @@ import chisel3._ import chisel3.util._ import chisel3.experimental.DataMirror -import freechips.rocketchip.config.Parameters +import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp} import freechips.rocketchip.tile.{CoreBundle, HasCoreParameters} import freechips.rocketchip.tilelink._ diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala index 257721ca..26b8c3aa 100644 --- 
a/src/main/scala/gemmini/DSEConfigs.scala +++ b/src/main/scala/gemmini/DSEConfigs.scala @@ -2,7 +2,7 @@ package gemmini import chisel3._ -import freechips.rocketchip.config.{Config, Parameters} +import org.chipsalliance.cde.config.{Config, Parameters} import freechips.rocketchip.diplomacy.{LazyModule, ValName} import freechips.rocketchip.subsystem._ import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet} diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala index 65add720..13f45eea 100644 --- a/src/main/scala/gemmini/ExecuteController.scala +++ b/src/main/scala/gemmini/ExecuteController.scala @@ -5,7 +5,7 @@ import chisel3._ import chisel3.util._ import GemminiISA._ import Util._ -import freechips.rocketchip.config.Parameters +import org.chipsalliance.cde.config.Parameters import midas.targetutils.PerfCounter // TODO do we still need to flush when the dataflow is weight stationary? Won't the result just keep travelling through on its own? 
diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala index 63a41496..4a2e3d21 100644 --- a/src/main/scala/gemmini/FrontendTLB.scala +++ b/src/main/scala/gemmini/FrontendTLB.scala @@ -3,7 +3,7 @@ package gemmini import chisel3._ import chisel3.util._ -import freechips.rocketchip.config.Parameters +import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.rocket._ import freechips.rocketchip.tile.{CoreBundle, CoreModule} import freechips.rocketchip.tilelink.TLEdgeOut diff --git a/src/main/scala/gemmini/InstructionCompression.scala b/src/main/scala/gemmini/InstructionCompression.scala index fe6cd3d9..64ab15dc 100644 --- a/src/main/scala/gemmini/InstructionCompression.scala +++ b/src/main/scala/gemmini/InstructionCompression.scala @@ -4,7 +4,7 @@ import chisel3._ import chisel3.util._ import freechips.rocketchip.tile.RoCCCommand -import freechips.rocketchip.config.Parameters +import org.chipsalliance.cde.config.Parameters import GemminiISA._ import Util._ diff --git a/src/main/scala/gemmini/LoadController.scala b/src/main/scala/gemmini/LoadController.scala index 71ecf7c7..4e5132f7 100644 --- a/src/main/scala/gemmini/LoadController.scala +++ b/src/main/scala/gemmini/LoadController.scala @@ -5,7 +5,7 @@ import chisel3._ import chisel3.util._ import GemminiISA._ import Util._ -import freechips.rocketchip.config.Parameters +import org.chipsalliance.cde.config.Parameters import midas.targetutils.PerfCounter // TODO we need to check for WAW errors here diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index 53032a51..db6866a9 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -5,7 +5,7 @@ import chisel3._ import chisel3.util._ import chisel3.experimental._ import freechips.rocketchip.tile.RoCCCommand -import freechips.rocketchip.config.Parameters +import org.chipsalliance.cde.config.Parameters import GemminiISA._ import 
LocalAddr._ import Util._ diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala index a33155e9..c9e6fed3 100644 --- a/src/main/scala/gemmini/LoopMatmul.scala +++ b/src/main/scala/gemmini/LoopMatmul.scala @@ -5,7 +5,7 @@ import chisel3._ import chisel3.util._ import chisel3.experimental._ import freechips.rocketchip.tile.RoCCCommand -import freechips.rocketchip.config.Parameters +import org.chipsalliance.cde.config.Parameters import GemminiISA._ import LocalAddr._ import Util._ diff --git a/src/main/scala/gemmini/LoopUnroller.scala b/src/main/scala/gemmini/LoopUnroller.scala index 63a0150b..75ad7d3a 100644 --- a/src/main/scala/gemmini/LoopUnroller.scala +++ b/src/main/scala/gemmini/LoopUnroller.scala @@ -5,7 +5,7 @@ import chisel3.util._ import chisel3.experimental._ import freechips.rocketchip.tile.RoCCCommand -import freechips.rocketchip.config.Parameters +import org.chipsalliance.cde.config.Parameters import GemminiISA._ import Util._ diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index f6e40187..ceb8b0ae 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -3,7 +3,7 @@ package gemmini import chisel3._ import chisel3.util._ -import freechips.rocketchip.config.Parameters +import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp} import freechips.rocketchip.rocket._ import freechips.rocketchip.tile._ diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala index bf4a71b2..72cd761b 100644 --- a/src/main/scala/gemmini/StoreController.scala +++ b/src/main/scala/gemmini/StoreController.scala @@ -6,7 +6,7 @@ import chisel3.util._ import chisel3.experimental._ import GemminiISA._ import Util._ -import freechips.rocketchip.config.Parameters +import org.chipsalliance.cde.config.Parameters import midas.targetutils.PerfCounter // TODO 
this is almost a complete copy of LoadController. We should combine them into one class diff --git a/src/main/scala/gemmini/TilerController.scala b/src/main/scala/gemmini/TilerController.scala index f3275790..45930007 100644 --- a/src/main/scala/gemmini/TilerController.scala +++ b/src/main/scala/gemmini/TilerController.scala @@ -6,7 +6,7 @@ package gemmini import chisel3._ import chisel3.util._ import chisel3.experimental._ -import freechips.rocketchip.config._ +import org.chipsalliance.cde.config._ import freechips.rocketchip.rocket._ import freechips.rocketchip.tile._ diff --git a/src/main/scala/gemmini/TilerFSM.scala b/src/main/scala/gemmini/TilerFSM.scala index db400f96..010bed9a 100644 --- a/src/main/scala/gemmini/TilerFSM.scala +++ b/src/main/scala/gemmini/TilerFSM.scala @@ -6,7 +6,7 @@ package gemmini import chisel3._ import chisel3.util._ import chisel3.experimental._ -import freechips.rocketchip.config._ +import org.chipsalliance.cde.config._ import freechips.rocketchip.tile._ import GemminiISA._ import Util.regwire diff --git a/src/main/scala/gemmini/TilerScheduler.scala b/src/main/scala/gemmini/TilerScheduler.scala index d9fd8b6a..0b273a5a 100644 --- a/src/main/scala/gemmini/TilerScheduler.scala +++ b/src/main/scala/gemmini/TilerScheduler.scala @@ -5,7 +5,7 @@ package gemmini import chisel3._ import chisel3.util._ -import freechips.rocketchip.config._ +import org.chipsalliance.cde.config._ import freechips.rocketchip.tile._ import GemminiISA._ import Util._ diff --git a/src/main/scala/gemmini/TransposePreloadUnroller.scala b/src/main/scala/gemmini/TransposePreloadUnroller.scala index 18d7320d..68407344 100644 --- a/src/main/scala/gemmini/TransposePreloadUnroller.scala +++ b/src/main/scala/gemmini/TransposePreloadUnroller.scala @@ -3,7 +3,7 @@ package gemmini import chisel3._ import chisel3.util._ import chisel3.experimental.ChiselEnum -import chipsalliance.rocketchip.config.Parameters +import org.chipsalliance.cde.config.Parameters import Util._ import 
midas.targetutils.PerfCounter From b51f7a1af48f2c9ff91bcce8b4adc7f2336dff92 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Mon, 20 Mar 2023 23:48:01 -0700 Subject: [PATCH 57/64] Bump chipyard hash --- CHIPYARD.hash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHIPYARD.hash b/CHIPYARD.hash index 747b6b18..b51f4be7 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -bcbe3b7f1f40d1c388aca68df498fd7dd4d16e89 +52671ba069ae17c68a611b1e0bffc692d8cb2f4a From 965ea0b3c5ffd7b68e6bf5a50ea9ff0750a5b988 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Fri, 31 Mar 2023 17:26:07 -0700 Subject: [PATCH 58/64] Fix LoopConv variable naming --- src/main/scala/gemmini/LoopConv.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index db6866a9..bc87ae10 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -661,12 +661,12 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera // val new_weights = b === 0.U && orow === 0.U && ocol === 0.U val new_weights = Reg(Bool()) - val krow_ = Mux(req.wrot180, krows - krow - 1.U, krow) - val kcol_ = Mux(req.wrot180, kcols - kcol - 1.U, kcol) + val krow_rot = Mux(req.wrot180, krows - krow - 1.U, krow) + val kcol_rot = Mux(req.wrot180, kcols - kcol - 1.U, kcol) val b_addr = Mux(req.trans_weight_0132, - b_addr_start +& (kch / block_size.U(och.getWidth.W)) * krows * kcols * ochs +& krow_ * kcols * ochs +& kcol_ * ochs +& och, - b_addr_start +& (och / block_size.U(och.getWidth.W)) * krows * kcols * kchs +& krow_ * kcols * kchs +& kcol_ * kchs +& kch) + b_addr_start +& (kch / block_size.U(och.getWidth.W)) * krows * kcols * ochs +& krow_rot * kcols * ochs +& kcol_rot * ochs +& och, + b_addr_start +& (och / block_size.U(och.getWidth.W)) * krows * kcols * kchs +& krow_rot * kcols * kchs +& kcol_rot * kchs +& kch) class RoCCCommandWithAddr extends Bundle 
{ val cmd = new RoCCCommand From 934058a2e37b38682fc38db13c75fafcbb20a619 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Tue, 4 Apr 2023 11:35:45 -0700 Subject: [PATCH 59/64] Bump Chipyard --- CHIPYARD.hash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHIPYARD.hash b/CHIPYARD.hash index b51f4be7..ed413fdd 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -52671ba069ae17c68a611b1e0bffc692d8cb2f4a +569917e2f30616f85a841d16a92914ae98ad7184 From 9debd98b03b2835538ea47acdf391d73d816ee67 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Sun, 7 May 2023 19:58:40 -0700 Subject: [PATCH 60/64] fix simulator args in run-midas.sh --- scripts/run-midas.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/scripts/run-midas.sh b/scripts/run-midas.sh index 63616809..8de7a0fc 100755 --- a/scripts/run-midas.sh +++ b/scripts/run-midas.sh @@ -120,7 +120,17 @@ if [ ! -f ./${simulator}${DEBUG} ]; then echo 'Did you run `./scripts/build-midas.sh`?' fi +sim_args="+vcs+initreg+0 +vcs+initmem+0 +fesvr-step-size=128 +mm_relaxFunctionalModel_0=0 +mm_openPagePolicy_0=1 +mm_backendLatency_0=2 +mm_dramTimings_tAL_0=0 +mm_dramTimings_tCAS_0=14 +mm_dramTimings_tCMD_0=1 +mm_dramTimings_tCWD_0=10 +mm_dramTimings_tCCD_0=4 +mm_dramTimings_tFAW_0=25 +mm_dramTimings_tRAS_0=33 +mm_dramTimings_tREFI_0=7800 +mm_dramTimings_tRC_0=47 +mm_dramTimings_tRCD_0=14 +mm_dramTimings_tRFC_0=160 +mm_dramTimings_tRRD_0=8 +mm_dramTimings_tRP_0=14 +mm_dramTimings_tRTP_0=8 +mm_dramTimings_tRTRS_0=2 +mm_dramTimings_tWR_0=15 +mm_dramTimings_tWTR_0=8 +mm_rowAddr_offset_0=18 +mm_rowAddr_mask_0=65535 +mm_rankAddr_offset_0=16 +mm_rankAddr_mask_0=3 +mm_bankAddr_offset_0=13 +mm_bankAddr_mask_0=7 +shmemportname0=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +macaddr0=00:00:00:00:00:02 +niclog0=niclog0 +linklatency0=6405 +netbw0=100 +netburst0=8 +nic-loopback0 +tracefile=TRACEFILE +blkdev-in-mem0=128 
+blkdev-log0=blkdev-log0 +autocounter-readrate=1000 +autocounter-filename=AUTOCOUNTERFILE +max-cycles=100000000 +dramsim +dramsim_ini_dir=/home/eecs/hngenc/chip/generators/testchipip/src/main/resources/dramsim2_ini" + +if [ $dram_model == "DDR3FRFCFS" ] || [ $dram_model == "DDR3FRFCFSLLC4MB" ]; then + sim_args="$sim_args +mm_schedulerWindowSize_0=8 +mm_transactionQueueDepth_0=8" +fi + +if [ $dram_model == "DDR3FRFCFSLLC4MB" ]; then + sim_args="$sim_args +mm_llc_wayBits_0=3 +mm_llc_setBits_0=12 +mm_llc_blockBits_0=7 +mm_llc_activeMSHRs_0=8" +fi + ./${simulator}${DEBUG} ${PK} ${full_binary_path} ${waveform_flag} \ - +vcs+initreg+0 +vcs+initmem+0 +fesvr-step-size=128 +mm_relaxFunctionalModel_0=0 +mm_openPagePolicy_0=1 +mm_backendLatency_0=2 +mm_schedulerWindowSize_0=8 +mm_transactionQueueDepth_0=8 +mm_dramTimings_tAL_0=0 +mm_dramTimings_tCAS_0=14 +mm_dramTimings_tCMD_0=1 +mm_dramTimings_tCWD_0=10 +mm_dramTimings_tCCD_0=4 +mm_dramTimings_tFAW_0=25 +mm_dramTimings_tRAS_0=33 +mm_dramTimings_tREFI_0=7800 +mm_dramTimings_tRC_0=47 +mm_dramTimings_tRCD_0=14 +mm_dramTimings_tRFC_0=160 +mm_dramTimings_tRRD_0=8 +mm_dramTimings_tRP_0=14 +mm_dramTimings_tRTP_0=8 +mm_dramTimings_tRTRS_0=2 +mm_dramTimings_tWR_0=15 +mm_dramTimings_tWTR_0=8 +mm_rowAddr_offset_0=18 +mm_rowAddr_mask_0=65535 +mm_rankAddr_offset_0=16 +mm_rankAddr_mask_0=3 +mm_bankAddr_offset_0=13 +mm_bankAddr_mask_0=7 +mm_llc_wayBits_0=3 +mm_llc_setBits_0=12 +mm_llc_blockBits_0=7 +mm_llc_activeMSHRs_0=8 +shmemportname0=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +macaddr0=00:00:00:00:00:02 +niclog0=niclog0 +linklatency0=6405 +netbw0=100 +netburst0=8 +nic-loopback0 +tracefile=TRACEFILE +blkdev-in-mem0=128 +blkdev-log0=blkdev-log0 +autocounter-readrate=1000 +autocounter-filename=AUTOCOUNTERFILE +max-cycles=100000000 \ - +dramsim +dramsim_ini_dir=/home/eecs/hngenc/chip/generators/testchipip/src/main/resources/dramsim2_ini \ + $sim_args \ 2>/dev/null + From 
7c16dcf349e84c23f2af5176043185e3046b909b Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Tue, 9 May 2023 22:10:41 -0700 Subject: [PATCH 61/64] set system-bus-width to match Gemmini's dma-bus-width --- README.md | 2 +- src/main/scala/gemmini/CustomSoCConfigs.scala | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4caf35bb..1a1db0ce 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ Run these steps to install Chipyard and Spike (make sure to checkout the correct ```shell git clone https://github.com/ucb-bar/chipyard.git cd chipyard -git checkout 1.8.1 +git checkout 1.9.1 ./build-setup.sh riscv-tools source env.sh diff --git a/src/main/scala/gemmini/CustomSoCConfigs.scala b/src/main/scala/gemmini/CustomSoCConfigs.scala index 58636db1..cdf2fd54 100644 --- a/src/main/scala/gemmini/CustomSoCConfigs.scala +++ b/src/main/scala/gemmini/CustomSoCConfigs.scala @@ -1,7 +1,8 @@ /* package chipyard -import org.chipsalliance.cde.config.{Config} +import org.chipsalliance.cde.config.Config +import gemmini.{GemminiCustomConfig, GemminiCustomConfigs} class CustomGemminiSoCConfig extends Config( new gemmini.GemminiCustomConfig ++ @@ -19,6 +20,7 @@ class CustomGemminiSoCConfig extends Config( // Set the number of CPUs you want to create new chipyard.CustomGemmminiCPUConfigs.CustomCPU(1) ++ + new chipyard.config.WithSystemBusWidth(GemminiCustomConfigs.customConfig.dma_buswidth) ++ new chipyard.config.AbstractConfig ) */ From 4070e2c9dcb16d24445ceabec9e5b4a7215e5ba7 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Tue, 9 May 2023 22:53:31 -0700 Subject: [PATCH 62/64] update readme to include scratchpad-memory stride in config-mvin --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1a1db0ce..e05dadb1 100644 --- a/README.md +++ b/README.md @@ -496,8 +496,9 @@ This limitation may be lifted in the future. 
- `rs1[1:0]` must be `01` - `rs1[2]` is 0 if `mvin`s to the accumulator are of type `accType`, and 1 if they are `inputType` - `rs1[4:3]` is 0 if the stride is being set for `mvin`, 1 if the stride is being set for `mvin2`, and 2 if the stride is being set for `mvin3` +- `rs1[31:16]` is the scratchpad-memory stride (also called the "private-memory stride" above) - `rs1[63:32]` is the "scale" by which to multiply data as it's being moved in to the scratchpad. This is ignored if Gemmini isn't configured to have the ability to scale values during `mvin`s. -- `rs2` = the stride in bytes +- `rs2` is the main-memory stride in bytes - `funct` = 0 **Action:** stride <= rs2; scale <= rs1[63:32] From d48db378f5f7dabeb677cfc0536aecb7711e37ad Mon Sep 17 00:00:00 2001 From: Charles Hong <31425218+charleshong3@users.noreply.github.com> Date: Wed, 10 May 2023 10:23:29 -0700 Subject: [PATCH 63/64] add timeloop arch and mapspace YAMLs (#279) --- modeling/timeloop/arch/arch_default.yaml | 64 ++++++++++++++++++++++++ modeling/timeloop/mapspace/mapspace.yaml | 50 ++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 modeling/timeloop/arch/arch_default.yaml create mode 100644 modeling/timeloop/mapspace/mapspace.yaml diff --git a/modeling/timeloop/arch/arch_default.yaml b/modeling/timeloop/arch/arch_default.yaml new file mode 100644 index 00000000..84c7ca1c --- /dev/null +++ b/modeling/timeloop/arch/arch_default.yaml @@ -0,0 +1,64 @@ +# A Timeloop-compatible YAML definition of an architecture similar to Gemmini +# WARNING: Not correlated with Gemmini RTL +architecture: + version: 0.3 + + subtree: + - name: System + local: + - name: DRAM + class: DRAM + attributes: + instances: 1 + word-bits: 8 + block_size: 64 + shared_bandwidth: 8 + + subtree: + - name: Chip + attributes: + technology: 40nm + local: + - name: Scratchpad + class: SRAM + attributes: + entries: 262144 + depth: 16384 + width: 128 + instances: 1 + meshX: 1 + word-bits: 8 + n_rdwr_ports: 2 + n_banks: 4 + 
subtree: + - name: PECols[0..15] # only the K dim can be parallelized across Accumulator columns + local: + - name: Accumulator + class: SRAM + attributes: + entries: 1024 # acc size / pe_dim = 16384/16 + depth: 1024 + width: 32 + instances: 16 + word-bits: 32 + network-word-bits: 16 + n_rdwr_ports: 2 + n_banks: 2 + subtree: + - name: PERows[0..15] + local: + - name: Registers + class: SRAM + attributes: + depth: 1 + width: 8 + entries: 1 + instances: 256 + word-bits: 8 + n_rdwr_ports: 2 + n_banks: 1 + - name: MACC + class: intmac + attributes: + datawidth: 8 + word-bits: 8 diff --git a/modeling/timeloop/mapspace/mapspace.yaml b/modeling/timeloop/mapspace/mapspace.yaml new file mode 100644 index 00000000..a4cf3327 --- /dev/null +++ b/modeling/timeloop/mapspace/mapspace.yaml @@ -0,0 +1,50 @@ +mapspace_constraints: + - target: Registers + type: bypass + bypass: + - Outputs + - Inputs + keep: + - Weights + - target: Accumulator + type: bypass + bypass: + - Weights + - Inputs + keep: + - Outputs + - target: Scratchpad + type: bypass + keep: + - Inputs + - Weights + bypass: + - Outputs + - target: DRAM + type: bypass + keep: + - Weights + - Inputs + - Outputs + bypass: [] + - target: Registers + type: temporal + factors: R=1 S=1 P=1 Q=1 C=1 K=1 N=1 + permutation: PQRSCKN + - target: Accumulator + type: spatial + factors: R=1 S=1 P=1 Q=1 C<=16 K=1 N=1 + permutation: QKC + - target: Accumulator + type: temporal + permutation: QPNCSRK + - target: Scratchpad + type: spatial + factors: R=1 S=1 P=1 Q=1 N=1 C=1 K<=16 + - target: Scratchpad + type: temporal + factors: R=1 S=1 P=1 Q=1 C=1 K=1 N=1 + - target: DRAM + type: temporal + permutation: CSRKQPN + From 2efd84adc4d1f320187845060375507f51db983b Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Mon, 22 May 2023 13:20:42 -0700 Subject: [PATCH 64/64] Update checkout instruction in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e05dadb1..7d6eb790 100644 --- 
a/README.md +++ b/README.md @@ -39,7 +39,7 @@ source env.sh cd generators/gemmini git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" -git checkout dev && git pull origin dev +git fetch && git checkout v0.7.1 git submodule update --init --recursive make -C software/libgemmini install