From 4c162b9d358f3ad9e0cfde72838412990f55f8a8 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sat, 19 Mar 2022 10:14:31 -0700
Subject: [PATCH 01/64] Mobilenet improvements (#209)

Offloads depthwise convolution scheduling onto the loop-unroller FSM. This improves MobileNet performance by 2-3x.

Also adds more printfs to `mobilenet.c` to show the performance of each individual layer.
---
 SPIKE.hash                                  |  2 +-
 software/gemmini-rocc-tests                 |  2 +-
 src/main/scala/gemmini/Controller.scala     |  2 +-
 src/main/scala/gemmini/GemminiConfigs.scala |  1 +
 src/main/scala/gemmini/GemminiISA.scala     |  2 +-
 src/main/scala/gemmini/LoopConv.scala       | 13 ++++++++++---
 6 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/SPIKE.hash b/SPIKE.hash
index f08ac921..e1be3611 100644
--- a/SPIKE.hash
+++ b/SPIKE.hash
@@ -1 +1 @@
-090e82c473fd28b4eb2011ffcd771ead6076faab
+bd19c16c779d419c322790b81a5ac57e47773c1c
diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index e326e7c4..6ad94746 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit e326e7c43457ff08669fe88edcaa395d846474d8
+Subproject commit 6ad94746cb72ba6824b101dc6a46d5e997cd17b3
diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
index 3ff10955..cf572c0c 100644
--- a/src/main/scala/gemmini/Controller.scala
+++ b/src/main/scala/gemmini/Controller.scala
@@ -150,7 +150,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
     new ConfigExRs1(acc_scale_t_bits), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t),
     new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t),
     new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t),
-    has_training_convs, has_max_pool, has_first_layer_optimizations) }
+    has_training_convs, has_max_pool, has_first_layer_optimizations, has_dw_convs) }
 
   val (loop_cmd, loop_matmul_unroller_busy) = withClock (gated_clock) { LoopMatmul(conv_cmd, reservation_station.io.matmul_ld_completed, reservation_station.io.matmul_st_completed, reservation_station.io.matmul_ex_completed,
     meshRows*tileRows, coreMaxAddrBits, reservation_station_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries,
diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala
index 97e068c8..40c1c777 100644
--- a/src/main/scala/gemmini/GemminiConfigs.scala
+++ b/src/main/scala/gemmini/GemminiConfigs.scala
@@ -85,6 +85,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
                                                                              has_training_convs: Boolean = true,
                                                                              has_max_pool: Boolean = true,
                                                                              has_nonlinear_activations: Boolean = true,
+                                                                             has_dw_convs: Boolean = true,
 
                                                                              has_first_layer_optimizations: Boolean = true,
 
diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala
index 9cb15ac9..ea3aed12 100644
--- a/src/main/scala/gemmini/GemminiISA.scala
+++ b/src/main/scala/gemmini/GemminiISA.scala
@@ -24,7 +24,7 @@ object GemminiISA {
   val LOAD3_CMD = 14.U
 
   // TODO add orows and ocols to this as well
-  val LOOP_CONV_WS = 15.U // no_bias, wrot180, trans_output_1203, trans_weight_1203, trans_input_3120, max_pixels_per_row | no_pool, downsample, input_dilated, act
+  val LOOP_CONV_WS = 15.U // no_bias, wrot180, trans_output_1203, trans_weight_1203, trans_input_3120, dw, max_pixels_per_row | no_pool, downsample, input_dilated, act
   val LOOP_CONV_WS_CONFIG_1 = 16.U // batch_size, in_dim, in_channels, out_channels | out_dim, pool_out_dim, stride, padding
   val LOOP_CONV_WS_CONFIG_2 = 17.U // kernel_dim, pool_size, pool_stride, pool_padding | batches, porows, pocols, pochs
   val LOOP_CONV_WS_CONFIG_3 = 18.U // krows, kcols, kchs, lpad | rpad, upad, dpad, plpad
diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index 16609f5a..3d7a099e 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -397,6 +397,7 @@ class LoopConvLdWeightReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth:
   val dram_addr = UInt(coreMaxAddrBits.W)
   val trans_weight_1203 = Bool()
   val trans_weight_0132 = Bool()
+  val dw = Bool()
   val loop_id = UInt(log2Up(concurrent_loops).W)
 }
 
@@ -439,6 +440,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
   val addr_start = req.addr_end - B_rows
 
   val dram_stride = MuxCase(out_channels, Seq(
+    req.dw -> 1.U,
     req.trans_weight_1203 -> (kernel_dim * kernel_dim * out_channels),
     req.trans_weight_0132 -> in_channels
   )) * (input_w/8).U
@@ -451,6 +453,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
 
   // Addresses
   val dram_offset = MuxCase(((krow*kernel_dim*in_channels +& kcol*in_channels +& kch) * out_channels +& och) * (input_w/8).U, Seq(
+    req.dw -> (krow * kernel_dim +& kcol) * (input_w/8).U,
     req.trans_weight_1203 -> (((kch*kernel_dim*kernel_dim +& krow*kernel_dim +& kcol) * out_channels +& och) * (input_w/8).U),
     req.trans_weight_0132 -> (((krow*kernel_dim*out_channels +& kcol*out_channels +& och) * in_channels +& kch) * (input_w/8).U)
   ))
@@ -1059,6 +1062,7 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s
   val trans_weight_1203 = Bool()
   val trans_weight_0132 = Bool()
   val trans_input_3120 = Bool()
+  val dw = Bool()
 
   val max_pixels_per_row = UInt(small_iterator_bitwidth.W)
 
@@ -1150,7 +1154,8 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size:
                 config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2, mvout_rs2_t: MvoutRs2,
                 config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
                 compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs,
-                has_training_convs: Boolean, has_max_pool: Boolean, has_first_layer_optimizations: Boolean)
+                has_training_convs: Boolean, has_max_pool: Boolean, has_first_layer_optimizations: Boolean,
+                has_dw_convs: Boolean)
   (implicit p: Parameters) extends Module {
   val large_iterator_bitwidth = 16
   val small_iterator_bitwidth = 16 // 8
@@ -1330,6 +1335,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size:
         loop_being_configured.trans_weight_1203 := has_training_convs.B && cmd.bits.cmd.rs1(3)
         loop_being_configured.trans_weight_0132 := has_training_convs.B && cmd.bits.cmd.rs1(4)
         loop_being_configured.trans_input_3120 := has_training_convs.B && cmd.bits.cmd.rs1(5)
+        loop_being_configured.dw := has_dw_convs.B && cmd.bits.cmd.rs1(6)
 
         loop_being_configured.no_pool := !has_max_pool.B || cmd.bits.cmd.rs2(0)
         loop_being_configured.activation := cmd.bits.cmd.rs2(4,3)
@@ -1400,6 +1406,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size:
   ld_weights.io.req.bits.dram_addr := loop_requesting_ld_weights.weights_dram_addr
   ld_weights.io.req.bits.trans_weight_1203 := loop_requesting_ld_weights.trans_weight_1203
   ld_weights.io.req.bits.trans_weight_0132 := loop_requesting_ld_weights.trans_weight_0132
+  ld_weights.io.req.bits.dw := loop_requesting_ld_weights.dw
   ld_weights.io.req.bits.loop_id := loop_requesting_ld_weights_id
 
   ld_weights.io.req.valid := !loop_requesting_ld_weights.ld_weights_started && loop_requesting_ld_weights.configured
@@ -1503,13 +1510,13 @@ object LoopConv {
             config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2, config_mvout_rs2_t: ConfigMvoutRs2,
             mvout_rs2_t: MvoutRs2, config_ex_rs1_t: ConfigExRs1, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
             compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, has_training_convs: Boolean, has_max_pool: Boolean,
-            has_first_layer_optimizations: Boolean)
+            has_first_layer_optimizations: Boolean, has_dw_convs: Boolean)
            (implicit p: Parameters): (DecoupledIO[GemminiCmd], Bool) = {
 
     val mod = Module(new LoopConv(block_size, coreMaxAddrBits, rob_size, max_lds, max_exs, max_sts,
       max_addr, max_acc_addr, input_w, acc_w, dma_max_bytes,
       config_mvin_rs1_t, mvin_rs2_t, config_mvout_rs2_t, mvout_rs2_t, config_ex_rs1_t, preload_rs1_t, preload_rs2_t,
-      compute_rs1_t, compute_rs2_t, has_training_convs, has_max_pool, has_first_layer_optimizations))
+      compute_rs1_t, compute_rs2_t, has_training_convs, has_max_pool, has_first_layer_optimizations, has_dw_convs))
 
     mod.io.in <> in
     mod.io.ld_completed := ld_completed

From b9ccd8405d48ec9198921b9086740e2aa2e36192 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Wed, 4 May 2022 14:38:58 -0700
Subject: [PATCH 02/64] Bump ort to 2021-12-23 (#218)

Fixes ucb-bar/onnxruntime-riscv#76
---
 software/onnxruntime-riscv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/software/onnxruntime-riscv b/software/onnxruntime-riscv
index 7bbd0496..0c8c9b4f 160000
--- a/software/onnxruntime-riscv
+++ b/software/onnxruntime-riscv
@@ -1 +1 @@
-Subproject commit 7bbd0496b579863c6906c0449932ac5ddc4c5357
+Subproject commit 0c8c9b4f881b5f31d32c6b5a76cac4ee14a8f338

From 073e073ca8552ab5e23ec7bc31f65aeedccbcf34 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Tue, 23 Aug 2022 09:45:09 -0700
Subject: [PATCH 03/64] Add support for I-BERT's layernorm, softmax, and gelu
 ops (#237)

---
 SPIKE.hash                                    |   2 +-
 scripts/build-onnx-inference.sh               |   2 +-
 scripts/build-vcs.sh                          |   9 +-
 scripts/build-verilator.sh                    |   7 +-
 software/gemmini-ort.json                     |   2 +-
 software/gemmini-rocc-tests                   |   2 +-
 software/gemmini-tests-interactive.json       |   3 +-
 software/gemmini-tests.json                   |   3 +-
 src/main/scala/gemmini/AccumulatorMem.scala   |  46 +-
 src/main/scala/gemmini/AccumulatorScale.scala | 193 ++++--
 src/main/scala/gemmini/Activation.scala       |   6 +-
 src/main/scala/gemmini/Arithmetic.scala       | 276 ++++++--
 src/main/scala/gemmini/Configs.scala          |   2 +-
 src/main/scala/gemmini/Controller.scala       |   1 -
 src/main/scala/gemmini/CustomConfigs.scala    |   8 +
 .../scala/gemmini/DMACommandTracker.scala     |   1 -
 .../scala/gemmini/ExecuteController.scala     |  20 +-
 src/main/scala/gemmini/GemminiConfigs.scala   |   7 +-
 src/main/scala/gemmini/GemminiISA.scala       |  36 +-
 src/main/scala/gemmini/LocalAddr.scala        |   9 +-
 src/main/scala/gemmini/LoopConv.scala         |   4 +-
 src/main/scala/gemmini/LoopMatmul.scala       | 110 ++-
 src/main/scala/gemmini/NormCmd.scala          |  23 +
 src/main/scala/gemmini/Normalizer.scala       | 635 ++++++++++++++++++
 .../scala/gemmini/ReservationStation.scala    |  10 +-
 src/main/scala/gemmini/Scratchpad.scala       | 149 ++--
 src/main/scala/gemmini/StoreController.scala  |  71 +-
 27 files changed, 1396 insertions(+), 241 deletions(-)
 create mode 100644 src/main/scala/gemmini/NormCmd.scala
 create mode 100644 src/main/scala/gemmini/Normalizer.scala

diff --git a/SPIKE.hash b/SPIKE.hash
index e1be3611..27baea53 100644
--- a/SPIKE.hash
+++ b/SPIKE.hash
@@ -1 +1 @@
-bd19c16c779d419c322790b81a5ac57e47773c1c
+2ed403a70f65559a3c2a06bf724d4737edc73a23
diff --git a/scripts/build-onnx-inference.sh b/scripts/build-onnx-inference.sh
index 23742f5c..01d6e8ce 100755
--- a/scripts/build-onnx-inference.sh
+++ b/scripts/build-onnx-inference.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-cd /root/chipyard/generators/gemmini/software/onnxruntime-riscv/
+cd ./software/onnxruntime-riscv/
 rm -rf ./build/
 ./build.sh --parallel --enable_training --config=Debug --cmake_extra_defines onnxruntime_USE_SYSTOLIC=ON onnxruntime_SYSTOLIC_INT8=ON onnxruntime_SYSTOLIC_FP32=OFF
 cd ./systolic_runner/imagenet_runner/
diff --git a/scripts/build-vcs.sh b/scripts/build-vcs.sh
index e3213521..b15a23f2 100755
--- a/scripts/build-vcs.sh
+++ b/scripts/build-vcs.sh
@@ -4,21 +4,24 @@ help () {
   echo "Build a cycle-accurate VCS simulator for RISCV Gemmini programs,"
   echo 'matching `customConfig` in `configs/GemminiCustomConfigs.scala`.'
   echo
-  echo "Usage: $0 [-h|--help] [--debug]"
+  echo "Usage: $0 [-h|--help] [--debug] [-j [N]]"
   echo
   echo "Options:"
   echo " debug   Builds a VCS simulator which generates waveforms. Without this"
   echo "         option, the simulator will not generate any waveforms."
+  echo " j [N]   Allow N jobs at once; infinite jobs with no arg."
   exit
 }
 
 show_help=0
 debug=""
+j="1"
 
 while [ $# -gt 0 ] ; do
   case $1 in
     -h | --help) show_help=1 ;;
-    --debug) debug="debug"
+    --debug) debug="debug" ;;
+    -j) case "${2:-}" in ''|*[!0-9]*) j="" ;; *) j=$2; shift ;; esac ;; # N is optional: only consume the next word when it is a number
   esac
 
   shift
@@ -29,5 +32,5 @@ if [ $show_help -eq 1 ]; then
 fi
 
 cd ../../sims/vcs/
-make ${debug} CONFIG=CustomGemminiSoCConfig
+make -j$j ${debug} CONFIG=CustomGemminiSoCConfig
 
diff --git a/scripts/build-verilator.sh b/scripts/build-verilator.sh
index 965d335b..49a25f29 100755
--- a/scripts/build-verilator.sh
+++ b/scripts/build-verilator.sh
@@ -4,21 +4,24 @@ help () {
   echo "Build a cycle-accurate Verilator simulator for RISCV Gemmini programs,"
   echo 'matching `customConfig` in `configs/GemminiCustomConfigs.scala`.'
   echo
-  echo "Usage: $0 [-h|--help] [--debug]"
+  echo "Usage: $0 [-h|--help] [--debug] [-j [N]]"
   echo
   echo "Options:"
   echo " debug   Builds a Verilator simulator which generates waveforms. Without"
   echo "         this option, the simulator will not generate any waveforms."
+  echo " j [N]   Allow N jobs at once; infinite jobs with no arg."
   exit
 }
 
 show_help=0
 debug=""
+j="1"
 
 while [ $# -gt 0 ] ; do
   case $1 in
     -h | --help) show_help=1 ;;
-    --debug) debug="debug"
+    --debug) debug="debug" ;;
+    -j) case "${2:-}" in ''|*[!0-9]*) j="" ;; *) j=$2; shift ;; esac ;; # N is optional: only consume the next word when it is a number
   esac
 
   shift
diff --git a/software/gemmini-ort.json b/software/gemmini-ort.json
index 7f561d79..c4a95253 100644
--- a/software/gemmini-ort.json
+++ b/software/gemmini-ort.json
@@ -52,7 +52,7 @@
     "/output/mobilenet_optimized_ws_nhwc_out.txt"
   ],
   "overlay": "../onnxruntime-riscv/systolic_runner/imagenet_runner",
-  "rootfs-size": "1GiB",
+  "rootfs-size": "16GiB",
   "run": "run-ort.sh"
 }
 
diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index 6ad94746..37464740 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit 6ad94746cb72ba6824b101dc6a46d5e997cd17b3
+Subproject commit 374647403d5e73543463f9f66f730aa16bc8e362
diff --git a/software/gemmini-tests-interactive.json b/software/gemmini-tests-interactive.json
index 8bd5f7ea..0fe52409 100644
--- a/software/gemmini-tests-interactive.json
+++ b/software/gemmini-tests-interactive.json
@@ -3,5 +3,6 @@
   "workdir" : ".",
   "base" : "br-base.json",
   "overlay" : "overlay",
-  "host-init" : "host-init.sh"
+  "host-init" : "host-init.sh",
+  "rootfs-size" : "16GiB"
 }
diff --git a/software/gemmini-tests.json b/software/gemmini-tests.json
index 72f8661c..fc0e45a9 100644
--- a/software/gemmini-tests.json
+++ b/software/gemmini-tests.json
@@ -4,5 +4,6 @@
   "base" : "br-base.json",
   "overlay" : "overlay",
   "host-init" : "host-init.sh",
-  "command": "/root/run-tests.sh"
+  "command": "/root/run-tests.sh",
+  "rootfs-size" : "16GiB"
 }
diff --git a/src/main/scala/gemmini/AccumulatorMem.scala b/src/main/scala/gemmini/AccumulatorMem.scala
index f8b62298..dd5ed821 100644
--- a/src/main/scala/gemmini/AccumulatorMem.scala
+++ b/src/main/scala/gemmini/AccumulatorMem.scala
@@ -5,30 +5,35 @@ import chisel3.util._
 
 import Util._
 
-class AccumulatorReadReq[T <: Data](n: Int, shift_width: Int, scale_t: T) extends Bundle {
+class AccumulatorReadReq[T <: Data: Arithmetic, U <: Data](n: Int, acc_t: T, scale_t: U) extends Bundle {
   val addr = UInt(log2Ceil(n).W)
   val scale = scale_t
-  val relu6_shift = UInt(shift_width.W)
-  val act = UInt(2.W) // TODO magic number
+  val igelu_qb = acc_t.cloneType // copied unchanged into the read response
+  val igelu_qc = acc_t.cloneType // copied unchanged into the read response
+  val iexp_qln2 = acc_t.cloneType // copied unchanged into the read response
+  val iexp_qln2_inv = acc_t.cloneType // copied unchanged into the read response
+  val act = UInt(Activation.bitwidth.W) // activation selector; width is defined by Activation.bitwidth
   val full = Bool() // Whether or not we return the full bitwidth output
 
   val fromDMA = Bool()
 
 }
 
-class AccumulatorReadResp[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U, shift_width: Int) extends Bundle {
+class AccumulatorReadResp[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle {
   val data = fullDataType.cloneType
   val fromDMA = Bool()
   val scale = scale_t.cloneType
-  val relu6_shift = UInt(shift_width.W)
-  val act = UInt(2.W) // TODO magic number
-  val acc_bank_id = UInt(2.W) // TODO don't hardcode
+  val igelu_qb = fullDataType.head.head.cloneType // same type as one accumulator element
+  val igelu_qc = fullDataType.head.head.cloneType // same type as one accumulator element
+  val iexp_qln2 = fullDataType.head.head.cloneType // same type as one accumulator element
+  val iexp_qln2_inv = fullDataType.head.head.cloneType // same type as one accumulator element
+  val act = UInt(Activation.bitwidth.W) // activation selector; width is defined by Activation.bitwidth
+  val acc_bank_id = UInt(2.W) // TODO magic number
 }
 
-class AccumulatorReadIO[T <: Data: Arithmetic, U <: Data](n: Int, shift_width: Int, fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle {
-  val req = Decoupled(new AccumulatorReadReq[U](n, shift_width, scale_t))
-  val resp = Flipped(Decoupled(new AccumulatorReadResp[T, U](fullDataType, scale_t, shift_width)))
-
+class AccumulatorReadIO[T <: Data: Arithmetic, U <: Data](n: Int, fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle {
+  val req = Decoupled(new AccumulatorReadReq[T, U](n, fullDataType.head.head.cloneType, scale_t))
+  val resp = Flipped(Decoupled(new AccumulatorReadResp[T, U](fullDataType, scale_t)))
 }
 
 class AccumulatorWriteReq[T <: Data: Arithmetic](n: Int, t: Vec[Vec[T]]) extends Bundle {
@@ -36,15 +41,13 @@ class AccumulatorWriteReq[T <: Data: Arithmetic](n: Int, t: Vec[Vec[T]]) extends
   val data = t.cloneType
   val acc = Bool()
   val mask = Vec(t.getWidth / 8, Bool()) // TODO Use aligned_to here
-  // val current_waddr = Flipped(Valid(UInt(log2Ceil(n).W))) // This is the raddr that is being fed into the SRAM right now
-
 }
 
 
 class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]], scale_t: U,
   acc_sub_banks: Int, use_shared_ext_mem: Boolean
 ) extends Bundle {
-  val read = Flipped(new AccumulatorReadIO(n, log2Ceil(t.head.head.getWidth), t, scale_t))
+  val read = Flipped(new AccumulatorReadIO(n, t, scale_t))
   val write = Flipped(Decoupled(new AccumulatorWriteReq(n, t)))
 
   val ext_mem = if (use_shared_ext_mem) Some(Vec(acc_sub_banks, new ExtMemIO)) else None
@@ -55,7 +58,6 @@ class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]]
     val op2 = Output(t.cloneType)
     val sum = Input(t.cloneType)
   }
-
 }
 
 class AccPipe[T <: Data : Arithmetic](latency: Int, t: T)(implicit ev: Arithmetic[T]) extends Module {
@@ -98,8 +100,6 @@ class AccumulatorMem[T <: Data, U <: Data](
   // to it, then we might not get the written data. We might need some kind of cooldown counter after addresses in the
   // accumulator have been written to for configurations with such small matrices
 
-  // TODO Refuse a read from an address which has only just been written to
-
   // TODO make a new aligned_to variable specifically for AccumulatorMem. We should assume that inputs are at least
   // accType.getWidth/8 aligned, because it won't make sense to do matrix additions directly in the DMA otherwise.
 
@@ -291,7 +291,7 @@ class AccumulatorMem[T <: Data, U <: Data](
     }
   }
 
-  val q = Module(new Queue(new AccumulatorReadResp(t, scale_t, log2Ceil(t.head.head.getWidth)),  1, true, true))
+  val q = Module(new Queue(new AccumulatorReadResp(t, scale_t),  1, true, true))
   q.io.enq.bits.data := rdata_for_read_resp
 
   if (is_dummy) {
@@ -300,7 +300,10 @@ class AccumulatorMem[T <: Data, U <: Data](
   }
 
   q.io.enq.bits.scale := RegNext(io.read.req.bits.scale)
-  q.io.enq.bits.relu6_shift := RegNext(io.read.req.bits.relu6_shift)
+  q.io.enq.bits.igelu_qb := RegNext(io.read.req.bits.igelu_qb)
+  q.io.enq.bits.igelu_qc := RegNext(io.read.req.bits.igelu_qc)
+  q.io.enq.bits.iexp_qln2 := RegNext(io.read.req.bits.iexp_qln2)
+  q.io.enq.bits.iexp_qln2_inv := RegNext(io.read.req.bits.iexp_qln2_inv)
   q.io.enq.bits.act := RegNext(io.read.req.bits.act)
   q.io.enq.bits.fromDMA := RegNext(io.read.req.bits.fromDMA)
   q.io.enq.bits.acc_bank_id := DontCare
@@ -310,7 +313,10 @@ class AccumulatorMem[T <: Data, U <: Data](
 
   io.read.resp.bits.data := p.bits.data
   io.read.resp.bits.fromDMA := p.bits.fromDMA
-  io.read.resp.bits.relu6_shift := p.bits.relu6_shift
+  io.read.resp.bits.igelu_qb := p.bits.igelu_qb
+  io.read.resp.bits.igelu_qc := p.bits.igelu_qc
+  io.read.resp.bits.iexp_qln2 := p.bits.iexp_qln2
+  io.read.resp.bits.iexp_qln2_inv := p.bits.iexp_qln2_inv
   io.read.resp.bits.act := p.bits.act
   io.read.resp.bits.scale := p.bits.scale
   io.read.resp.bits.acc_bank_id := DontCare // This is set in Scratchpad
diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala
index 2d23af1d..1fdd15fa 100644
--- a/src/main/scala/gemmini/AccumulatorScale.scala
+++ b/src/main/scala/gemmini/AccumulatorScale.scala
@@ -1,16 +1,16 @@
+
 package gemmini
 
 import chisel3._
 import chisel3.util._
-
 import Util._
 
-class AccumulatorReadRespWithFullData[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U, shift_width: Int) extends Bundle {
-  val resp = new AccumulatorReadResp(fullDataType, scale_t, shift_width)
+class AccumulatorReadRespWithFullData[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U)
+  extends Bundle {
+  val resp = new AccumulatorReadResp(fullDataType, scale_t)
   val full_data = fullDataType.cloneType
 }
 
-
 class AccumulatorScaleResp[T <: Data: Arithmetic](fullDataType: Vec[Vec[T]], rDataType: Vec[Vec[T]]) extends Bundle {
   val full_data = fullDataType.cloneType
   val data = rDataType.cloneType
@@ -19,26 +19,33 @@ class AccumulatorScaleResp[T <: Data: Arithmetic](fullDataType: Vec[Vec[T]], rDa
 }
 
 class AccumulatorScaleIO[T <: Data: Arithmetic, U <: Data](
-  fullDataType: Vec[Vec[T]], scale_t: U, shift_width: Int,
+  fullDataType: Vec[Vec[T]], scale_t: U,
   rDataType: Vec[Vec[T]]
 ) extends Bundle {
-  val in = Flipped(Decoupled(new AccumulatorReadResp[T,U](fullDataType, scale_t, shift_width)))
+  val in = Flipped(Decoupled(new NormalizedOutput[T,U](fullDataType, scale_t)))
   val out = Decoupled(new AccumulatorScaleResp[T](fullDataType, rDataType))
 }
 
 class AccScaleDataWithIndex[T <: Data: Arithmetic, U <: Data](t: T, u: U) extends Bundle {
-  val shift_width = log2Ceil(t.getWidth)
-
   val scale = u.cloneType
-  val act = UInt(2.W) // TODO magic number
-  val relu6_shift = UInt(shift_width.W)
+  val act = UInt(Activation.bitwidth.W) // must be as wide as AccumulatorReadResp.act, which drives it; a narrower field drops the upper bits
+  val igelu_qb = t.cloneType
+  val igelu_qc = t.cloneType
+  val iexp_qln2 = t.cloneType
+  val iexp_qln2_inv = t.cloneType
+  val mean = t.cloneType
+  val max = t.cloneType
+  val inv_stddev = u.cloneType
+  val inv_sum_exp = u.cloneType
   val data = t.cloneType
   val full_data = t.cloneType
   val id = UInt(2.W) // TODO hardcoded
   val index = UInt()
 }
 
-class AccScalePipe[T <: Data : Arithmetic, U <: Data](t: T, rDataType: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U, latency: Int, has_nonlinear_activations: Boolean)(implicit ev: Arithmetic[T]) extends Module {
+class AccScalePipe[T <: Data, U <: Data](t: T, rDataType: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U,
+                                         latency: Int, has_nonlinear_activations: Boolean, has_normalizations: Boolean)
+                                        (implicit ev: Arithmetic[T]) extends Module {
   val u = scale_t
   val io = IO(new Bundle {
     val in = Input(Valid(new AccScaleDataWithIndex(t, u)(ev)))
@@ -47,68 +54,97 @@ class AccScalePipe[T <: Data : Arithmetic, U <: Data](t: T, rDataType: Vec[Vec[T
   import ev._
   val out = WireInit(io.in)
 
-  val e_scaled = scale_func(io.in.bits.data, io.in.bits.scale)
+  val e = io.in.bits.data
+
+  val e_act = MuxCase(e, Seq(
+    (has_nonlinear_activations.B && io.in.bits.act === Activation.RELU) -> e.relu,
+    (has_nonlinear_activations.B && has_normalizations.B && io.in.bits.act === Activation.LAYERNORM) ->
+      (e - io.in.bits.mean).mult_with_reciprocal(io.in.bits.inv_stddev),
+    (has_nonlinear_activations.B && has_normalizations.B && io.in.bits.act === Activation.IGELU) ->
+      AccumulatorScale.igelu(e, io.in.bits.igelu_qb, io.in.bits.igelu_qc),
+    (has_nonlinear_activations.B && has_normalizations.B && io.in.bits.act === Activation.SOFTMAX) ->
+      scale_func(
+        AccumulatorScale.iexp(e - io.in.bits.max, io.in.bits.iexp_qln2, io.in.bits.iexp_qln2_inv, io.in.bits.igelu_qb, io.in.bits.igelu_qc),
+        io.in.bits.inv_sum_exp.asTypeOf(scale_t)),
+  ))
+
+  val e_scaled = scale_func(e_act, io.in.bits.scale)
   val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head)
-  val e_act = MuxCase(e_clipped, Seq(
-    (has_nonlinear_activations.B && io.in.bits.act === Activation.RELU) -> e_clipped.relu,
-    (has_nonlinear_activations.B && io.in.bits.act === Activation.RELU6) -> e_clipped.relu6(io.in.bits.relu6_shift)))
 
-  out.bits.data := e_act
+  out.bits.data := e_clipped
   io.out := Pipe(out, latency)
 }
 
 
-class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
+class AccumulatorScale[T <: Data, U <: Data](
   fullDataType: Vec[Vec[T]], rDataType: Vec[Vec[T]],
-  scale_t: U, shift_width: Int,
+  scale_t: U,
   read_small_data: Boolean, read_full_data: Boolean,
   scale_func: (T, U) => T,
   num_scale_units: Int,
   latency: Int,
-  has_nonlinear_activations: Boolean)(implicit ev: Arithmetic[T]) extends Module {
+  has_nonlinear_activations: Boolean, has_normalizations: Boolean)(implicit ev: Arithmetic[T]) extends Module {
 
   import ev._
 
   val io = IO(new AccumulatorScaleIO[T,U](
-    fullDataType, scale_t, shift_width, rDataType
+    fullDataType, scale_t, rDataType
   )(ev))
-  val t = io.in.bits.data(0)(0).cloneType
+  val t = io.in.bits.acc_read_resp.data(0)(0).cloneType
+  val acc_read_data = io.in.bits.acc_read_resp.data
   val out = Wire(Decoupled(new AccumulatorScaleResp[T](
     fullDataType, rDataType)(ev)))
 
   if (num_scale_units == -1) {
-    val in = Wire(Decoupled(new AccumulatorReadRespWithFullData(fullDataType, scale_t, shift_width)(ev)))
+    val data = io.in.bits.acc_read_resp.data
+    val act = io.in.bits.acc_read_resp.act
+    val igelu_qb = io.in.bits.acc_read_resp.igelu_qb
+    val igelu_qc = io.in.bits.acc_read_resp.igelu_qc
+    val iexp_qln2 = io.in.bits.acc_read_resp.iexp_qln2
+    val iexp_qln2_inv = io.in.bits.acc_read_resp.iexp_qln2_inv
+    val scale = io.in.bits.acc_read_resp.scale
+
+    val activated_data = VecInit(data.map(v => VecInit(v.map { e =>
+      val e_act = MuxCase(e, Seq(
+        (has_nonlinear_activations.B && act === Activation.RELU) -> e.relu,
+        (has_nonlinear_activations.B && has_normalizations.B && act === Activation.LAYERNORM) ->
+          (e - io.in.bits.mean).mult_with_reciprocal(io.in.bits.inv_stddev),
+        (has_nonlinear_activations.B && has_normalizations.B && act === Activation.IGELU) ->
+          AccumulatorScale.igelu(e, igelu_qb, igelu_qc),
+        (has_nonlinear_activations.B && has_normalizations.B && act === Activation.SOFTMAX) ->
+          scale_func(
+            AccumulatorScale.iexp(e - io.in.bits.max, iexp_qln2, iexp_qln2_inv, igelu_qb, igelu_qc),
+            io.in.bits.inv_sum_exp.asTypeOf(scale_t)),
+      ))
+
+      val e_scaled = scale_func(e_act, scale)
+      val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head)
+
+      e_clipped
+    })))
+
+    val in = Wire(Decoupled(new AccumulatorReadRespWithFullData(fullDataType, scale_t)(ev)))
     in.valid := io.in.valid
     io.in.ready := in.ready
-    in.bits.resp := io.in.bits
-    in.bits.full_data := io.in.bits.data
-
-    val pipe_out = Pipeline(in, latency, Seq.fill(latency)((x: AccumulatorReadRespWithFullData[T,U]) => x) :+ {
-      x: AccumulatorReadRespWithFullData[T,U] =>
-      val activated_rdata = VecInit(x.resp.data.map(v => VecInit(v.map { e =>
-        val e_scaled = scale_func(e, x.resp.scale)
-        val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head)
-        val e_act = MuxCase(e_clipped, Seq(
-          (x.resp.act === Activation.RELU) -> e_clipped.relu,
-          (x.resp.act === Activation.RELU6) -> e_clipped.relu6(x.resp.relu6_shift)))
-
-        e_act
-      })))
-      val result = WireInit(x)
-      result.resp.data := activated_rdata
-      result
-    })
-    out.valid      := pipe_out.valid
+    in.bits.resp := io.in.bits.acc_read_resp
+    in.bits.full_data := acc_read_data
+    in.bits.resp.data := activated_data
+
+    val pipe_out = Pipeline(in, latency)
+
+    out.valid := pipe_out.valid
     pipe_out.ready := out.ready
     out.bits.full_data := pipe_out.bits.full_data
     out.bits.data      := pipe_out.bits.resp.data
     out.bits.fromDMA   := pipe_out.bits.resp.fromDMA
     out.bits.acc_bank_id := pipe_out.bits.resp.acc_bank_id
   } else {
-    val width = io.in.bits.data.size * io.in.bits.data(0).size
+    val width = acc_read_data.size * acc_read_data(0).size
     val nEntries = 3
-    val regs = Reg(Vec(nEntries, Valid(new AccumulatorReadResp[T,U](
-      fullDataType, scale_t, shift_width)(ev))))
+    /*val regs = Reg(Vec(nEntries, Valid(new AccumulatorReadResp[T,U](
+      fullDataType, scale_t)(ev))))*/
+    val regs = Reg(Vec(nEntries, Valid(new NormalizedOutput[T,U](
+      fullDataType, scale_t)(ev))))
     val out_regs = Reg(Vec(nEntries, new AccumulatorScaleResp[T](
       fullDataType, rDataType)(ev)))
 
@@ -124,7 +160,7 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
           regs(i).valid := false.B
         }
       }
-      head_oh := (head_oh << 1) | head_oh(nEntries-1)
+      head_oh := (head_oh << 1).asUInt() | head_oh(nEntries-1)
     }
 
     io.in.ready := !Mux1H(tail_oh.asBools, regs.map(_.valid)) || (tail_oh === head_oh && out.fire)
@@ -133,13 +169,13 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
         when (tail_oh(i)) {
           regs(i).valid := true.B
           regs(i).bits  := io.in.bits
-          out_regs(i).fromDMA := io.in.bits.fromDMA
-          out_regs(i).acc_bank_id := io.in.bits.acc_bank_id
+          out_regs(i).fromDMA := io.in.bits.acc_read_resp.fromDMA
+          out_regs(i).acc_bank_id := io.in.bits.acc_read_resp.acc_bank_id
           fired_masks(i).foreach(_ := false.B)
           completed_masks(i).foreach(_ := false.B)
         }
       }
-      tail_oh := (tail_oh << 1) | tail_oh(nEntries-1)
+      tail_oh := (tail_oh << 1).asUInt() | tail_oh(nEntries-1)
     }
 
     val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t)(ev))) }
@@ -147,12 +183,22 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
     for (i <- 0 until nEntries) {
       for (w <- 0 until width) {
         val input = inputs(i*width+w)
+
+        val acc_read_resp = regs(i).bits.acc_read_resp
+
         input.valid       := regs(i).valid && !fired_masks(i)(w)
-        input.bits.data   := regs(i).bits.data(w / io.in.bits.data(0).size)(w % io.in.bits.data(0).size)
-        input.bits.full_data := regs(i).bits.data(w / io.in.bits.data(0).size)(w % io.in.bits.data(0).size)
-        input.bits.scale  := regs(i).bits.scale
-        input.bits.act    := regs(i).bits.act
-        input.bits.relu6_shift := regs(i).bits.relu6_shift
+        input.bits.data   := acc_read_resp.data(w / acc_read_data(0).size)(w % acc_read_data(0).size)
+        input.bits.full_data := acc_read_resp.data(w / acc_read_data(0).size)(w % acc_read_data(0).size)
+        input.bits.scale  := acc_read_resp.scale
+        input.bits.act    := acc_read_resp.act
+        input.bits.igelu_qb := acc_read_resp.igelu_qb
+        input.bits.igelu_qc := acc_read_resp.igelu_qc
+        input.bits.iexp_qln2 := acc_read_resp.iexp_qln2
+        input.bits.iexp_qln2_inv := acc_read_resp.iexp_qln2_inv
+        input.bits.mean := regs(i).bits.mean
+        input.bits.max := regs(i).bits.max
+        input.bits.inv_stddev := regs(i).bits.inv_stddev
+        input.bits.inv_sum_exp := regs(i).bits.inv_sum_exp
         input.bits.id := i.U
         input.bits.index := w.U
         when (input.fire) {
@@ -171,15 +217,16 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
       when (reset.asBool) {
         arbOut.valid := false.B
       }
-      val pipe = Module(new AccScalePipe(t, rDataType, scale_func, scale_t, latency, has_nonlinear_activations)(ev, ev))
+      val pipe = Module(new AccScalePipe(t, rDataType, scale_func, scale_t, latency, has_nonlinear_activations,
+        has_normalizations))
       pipe.io.in := arbOut
       val pipe_out = pipe.io.out
 
       for (j <- 0 until nEntries) {
         for (w <- 0 until width) {
           if ((j*width+w) % num_scale_units == i) {
-            val id0 = w % io.in.bits.data(0).size
-            val id1 = w / io.in.bits.data(0).size
+            val id0 = w % acc_read_data(0).size
+            val id1 = w / acc_read_data(0).size
             when (pipe_out.fire && pipe_out.bits.id === j.U && pipe_out.bits.index === w.U) {
               out_regs(j).data     (id1)(id0) := pipe_out.bits.data
               out_regs(j).full_data(id1)(id0) := pipe_out.bits.full_data
@@ -205,6 +252,40 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
     io.out.bits.full_data := out.bits.full_data
   else
     io.out.bits.full_data := DontCare
-
 }
 
+object AccumulatorScale {
+  def igelu[T <: Data](q: T, qb: T, qc: T)(implicit ev: Arithmetic[T]): T = {
+    import ev._
+
+    val zero = q.zero
+    val one = q.identity
+    def neg(x: T) = zero-x
+
+    val q_sign = Mux(q.zero > q, neg(one), one)
+    val q_abs = Mux(q.zero > q, neg(q), q)
+    val q_clipped = Mux(q_abs > neg(qb), neg(qb), q_abs)
+    val q_poly = qc.mac(q_clipped + qb, q_clipped + qb).withWidthOf(q)
+    val q_erf = (q_sign * q_poly).withWidthOf(q)
+    (q * (q_erf + qc)).withWidthOf(q)
+  }
+
+  def iexp[T <: Data](q: T, qln2: T, qln2_inv: T, qb: T, qc: T)(implicit ev: Arithmetic[T]): T = {
+    import ev._
+
+    val zero = q.zero
+    def neg(x: T) = zero-x
+
+    // qln2_inv needs scale to be
+    // 1 / (2 ** 16) / S
+
+    // qln2_inv / S / (2 ** 16) = 1 / ln2
+    // q * qln2_inv = x / S / ln2 * S * (2 ** 16) = x / ln2 * (2 ** 16)
+    val neg_q_iexp = neg(q)
+    val z_iexp = (neg_q_iexp * qln2_inv).asUInt().do_>>(16).asTypeOf(q) // q is non-positive
+    val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q)
+    val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + qb).withWidthOf(q)
+    // we don't want a rounding shift, so shift the raw UInt bits instead of using Arithmetic's >>
+    (q_poly_iexp.asUInt().do_>>(z_iexp.asUInt()(5, 0))).asTypeOf(q)
+  }}
+
diff --git a/src/main/scala/gemmini/Activation.scala b/src/main/scala/gemmini/Activation.scala
index ed7df57f..1b7d94e6 100644
--- a/src/main/scala/gemmini/Activation.scala
+++ b/src/main/scala/gemmini/Activation.scala
@@ -5,5 +5,9 @@ import chisel3._
 object Activation {
   val NONE = 0.U
   val RELU = 1.U
-  val RELU6 = 2.U
+  val LAYERNORM = 2.U
+  val IGELU = 3.U
+  val SOFTMAX = 4.U
+
+  val bitwidth = 3
 }
diff --git a/src/main/scala/gemmini/Arithmetic.scala b/src/main/scala/gemmini/Arithmetic.scala
index 4f8e9343..cdd36396 100644
--- a/src/main/scala/gemmini/Arithmetic.scala
+++ b/src/main/scala/gemmini/Arithmetic.scala
@@ -32,14 +32,21 @@ abstract class ArithmeticOps[T <: Data](self: T) {
   def *(t: T): T
   def mac(m1: T, m2: T): T // Returns (m1 * m2 + self)
   def +(t: T): T
+  def -(t: T): T
   def >>(u: UInt): T // This is a rounding shift! Rounds away from 0
   def >(t: T): Bool
   def identity: T
   def withWidthOf(t: T): T
   def clippedToWidthOf(t: T): T // Like "withWidthOf", except that it saturates
   def relu: T
-  def relu6(shift: UInt): T
   def zero: T
+  def minimum: T
+
+  // Optional parameters, which only need to be defined if you want to enable various optimizations for transformers
+  def divider(denom_t: UInt): Option[(DecoupledIO[UInt], DecoupledIO[T])] = None
+  def sqrt: Option[(DecoupledIO[UInt], DecoupledIO[T])] = None
+  def reciprocal[U <: Data](u: U): Option[(DecoupledIO[UInt], DecoupledIO[U])] = None
+  def mult_with_reciprocal[U <: Data](reciprocal: U): T = self // default: ignores `reciprocal` and returns self unchanged
 }
 
 object Arithmetic {
@@ -48,6 +55,7 @@ object Arithmetic {
       override def *(t: UInt) = self * t
       override def mac(m1: UInt, m2: UInt) = m1 * m2 + self
       override def +(t: UInt) = self + t
+      override def -(t: UInt) = self - t
 
       override def >>(u: UInt) = {
         // The equation we use can be found here: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm
@@ -72,15 +80,10 @@ object Arithmetic {
       }
 
       override def relu: UInt = self
-      override def relu6(shift: UInt): UInt = {
-        val max6 = (6.U << shift).asUInt()
-        val maxwidth = ((1 << (self.getWidth-1))-1).U
-        val max = Mux(max6 > maxwidth, maxwidth, max6)(self.getWidth-1, 0).asUInt()
-        Mux(self < max, self, max)
-      }
 
       override def zero: UInt = 0.U
       override def identity: UInt = 1.U
+      override def minimum: UInt = 0.U
     }
   }
 
@@ -89,6 +92,7 @@ object Arithmetic {
       override def *(t: SInt) = self * t
       override def mac(m1: SInt, m2: SInt) = m1 * m2 + self
       override def +(t: SInt) = self + t
+      override def -(t: SInt) = self - t
 
       override def >>(u: UInt) = {
         // The equation we use can be found here: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm
@@ -122,15 +126,204 @@ object Arithmetic {
       }
 
       override def relu: SInt = Mux(self >= 0.S, self, 0.S)
-      override def relu6(shift: UInt): SInt = {
-        val max6 = (6.S << shift).asSInt()
-        val maxwidth = ((1 << (self.getWidth-1))-1).S
-        val max = Mux(max6 > maxwidth, maxwidth, max6)(self.getWidth-1, 0).asSInt()
-        MuxCase(self, Seq((self < 0.S) -> 0.S, (self > max) -> max))
-      }
 
       override def zero: SInt = 0.S
       override def identity: SInt = 1.S
+      override def minimum: SInt = (-(BigInt(1) << (self.getWidth-1))).S // BigInt: an Int shift wraps for widths > 32
+
+      override def divider(denom_t: UInt): Option[(DecoupledIO[UInt], DecoupledIO[SInt])] = {
+        // TODO this uses a floating point divider, but we should use an integer divider instead
+
+        val input = Wire(Decoupled(denom_t.cloneType))
+        val output = Wire(Decoupled(self.cloneType))
+
+        // We translate our integer to floating-point form so that we can use the hardfloat divider
+        val expWidth = log2Up(self.getWidth) + 1
+        val sigWidth = self.getWidth
+
+        def sin_to_float(x: SInt) = {
+          val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth))
+          in_to_rec_fn.io.signedIn := true.B
+          in_to_rec_fn.io.in := x.asUInt()
+          in_to_rec_fn.io.roundingMode := consts.round_minMag // consts.round_near_maxMag
+          in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
+
+          in_to_rec_fn.io.out
+        }
+
+        def uin_to_float(x: UInt) = {
+          val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth))
+          in_to_rec_fn.io.signedIn := false.B
+          in_to_rec_fn.io.in := x
+          in_to_rec_fn.io.roundingMode := consts.round_minMag // consts.round_near_maxMag
+          in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
+
+          in_to_rec_fn.io.out
+        }
+
+        def float_to_in(x: UInt) = {
+          val rec_fn_to_in = Module(new RecFNToIN(expWidth = expWidth, sigWidth, self.getWidth))
+          rec_fn_to_in.io.signedOut := true.B
+          rec_fn_to_in.io.in := x
+          rec_fn_to_in.io.roundingMode := consts.round_minMag // consts.round_near_maxMag
+
+          rec_fn_to_in.io.out.asSInt()
+        }
+
+        val self_rec = sin_to_float(self)
+        val denom_rec = uin_to_float(input.bits)
+
+        // Instantiate the hardfloat divider
+        val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 0))
+
+        input.ready := divider.io.inReady
+        divider.io.inValid := input.valid
+        divider.io.sqrtOp := false.B
+        divider.io.a := self_rec
+        divider.io.b := denom_rec
+        divider.io.roundingMode := consts.round_minMag
+        divider.io.detectTininess := consts.tininess_afterRounding
+
+        output.valid := divider.io.outValid_div
+        output.bits := float_to_in(divider.io.out)
+
+        assert(!output.valid || output.ready)
+
+        Some((input, output))
+      }
+
+      override def sqrt: Option[(DecoupledIO[UInt], DecoupledIO[SInt])] = {
+        // TODO this uses a floating point divider, but we should use an integer divider instead
+
+        val input = Wire(Decoupled(UInt(0.W)))
+        val output = Wire(Decoupled(self.cloneType))
+
+        input.bits := DontCare
+
+        // We translate our integer to floating-point form so that we can use the hardfloat divider
+        val expWidth = log2Up(self.getWidth) + 1
+        val sigWidth = self.getWidth
+
+        def in_to_float(x: SInt) = {
+          val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth))
+          in_to_rec_fn.io.signedIn := true.B
+          in_to_rec_fn.io.in := x.asUInt()
+          in_to_rec_fn.io.roundingMode := consts.round_minMag // consts.round_near_maxMag
+          in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
+
+          in_to_rec_fn.io.out
+        }
+
+        def float_to_in(x: UInt) = {
+          val rec_fn_to_in = Module(new RecFNToIN(expWidth = expWidth, sigWidth, self.getWidth))
+          rec_fn_to_in.io.signedOut := true.B
+          rec_fn_to_in.io.in := x
+          rec_fn_to_in.io.roundingMode := consts.round_minMag // consts.round_near_maxMag
+
+          rec_fn_to_in.io.out.asSInt()
+        }
+
+        val self_rec = in_to_float(self)
+
+        // Instantiate the hardfloat sqrt unit
+        val sqrter = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 0))
+
+        input.ready := sqrter.io.inReady
+        sqrter.io.inValid := input.valid
+        sqrter.io.sqrtOp := true.B
+        sqrter.io.a := self_rec
+        sqrter.io.b := DontCare
+        sqrter.io.roundingMode := consts.round_minMag
+        sqrter.io.detectTininess := consts.tininess_afterRounding
+
+        output.valid := sqrter.io.outValid_sqrt
+        output.bits := float_to_in(sqrter.io.out)
+
+        assert(!output.valid || output.ready)
+
+        Some((input, output))
+      }
+
+      override def reciprocal[U <: Data](u: U): Option[(DecoupledIO[UInt], DecoupledIO[U])] = u match {
+        case Float(expWidth, sigWidth) =>
+          val input = Wire(Decoupled(UInt(0.W)))
+          val output = Wire(Decoupled(u.cloneType))
+
+          input.bits := DontCare
+
+          // We translate our integer to floating-point form so that we can use the hardfloat divider
+          def in_to_float(x: SInt) = {
+            val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth))
+            in_to_rec_fn.io.signedIn := true.B
+            in_to_rec_fn.io.in := x.asUInt()
+            in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag
+            in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
+
+            in_to_rec_fn.io.out
+          }
+
+          val self_rec = in_to_float(self)
+          val one_rec = in_to_float(1.S)
+
+          // Instantiate the hardfloat divider
+          val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 0))
+
+          input.ready := divider.io.inReady
+          divider.io.inValid := input.valid
+          divider.io.sqrtOp := false.B
+          divider.io.a := one_rec
+          divider.io.b := self_rec
+          divider.io.roundingMode := consts.round_near_even
+          divider.io.detectTininess := consts.tininess_afterRounding
+
+          output.valid := divider.io.outValid_div
+          output.bits := fNFromRecFN(expWidth, sigWidth, divider.io.out).asTypeOf(u)
+
+          assert(!output.valid || output.ready)
+
+          Some((input, output))
+
+        case _ => None
+      }
+
+      override def mult_with_reciprocal[U <: Data](reciprocal: U): SInt = reciprocal match {
+        case recip @ Float(expWidth, sigWidth) =>
+          def in_to_float(x: SInt) = {
+            val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth))
+            in_to_rec_fn.io.signedIn := true.B
+            in_to_rec_fn.io.in := x.asUInt()
+            in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag
+            in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
+
+            in_to_rec_fn.io.out
+          }
+
+          def float_to_in(x: UInt) = {
+            val rec_fn_to_in = Module(new RecFNToIN(expWidth = expWidth, sigWidth, self.getWidth))
+            rec_fn_to_in.io.signedOut := true.B
+            rec_fn_to_in.io.in := x
+            rec_fn_to_in.io.roundingMode := consts.round_minMag
+
+            rec_fn_to_in.io.out.asSInt()
+          }
+
+          val self_rec = in_to_float(self)
+          val reciprocal_rec = recFNFromFN(expWidth, sigWidth, recip.bits)
+
+          // Instantiate the hardfloat multiply-adder (c is tied to 0, so it computes self * reciprocal)
+          val muladder = Module(new MulAddRecFN(expWidth, sigWidth))
+          muladder.io.op := 0.U
+          muladder.io.roundingMode := consts.round_near_even
+          muladder.io.detectTininess := consts.tininess_afterRounding
+
+          muladder.io.a := self_rec
+          muladder.io.b := reciprocal_rec
+          muladder.io.c := 0.U
+
+          float_to_in(muladder.io.out)
+
+        case _ => self
+      }
     }
   }
 
@@ -239,6 +432,12 @@ object Arithmetic {
         result
       }
 
+      override def -(t: Float): Float = { // Subtraction, computed as self + (t with its top bit flipped)
+        val t_sgn = t.bits(t.getWidth-1) // most-significant bit of t (the sign bit, going by the name)
+        val neg_t = Cat(~t_sgn, t.bits(t.getWidth-2,0)).asTypeOf(t) // same lower bits, inverted top bit
+        self + neg_t
+      }
+
       override def >>(u: UInt): Float = {
         // Recode self
         val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits)
@@ -322,55 +521,9 @@ object Arithmetic {
         result
       }
 
-      override def relu6(shift: UInt): Float = {
-        // Get a constant 6 as a float
-        val in_to_rec_fn = Module(new INToRecFN(log2Up(6+1), self.expWidth, self.sigWidth))
-        in_to_rec_fn.io.signedIn := false.B
-        in_to_rec_fn.io.in := 6.U
-        in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag
-        in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
-
-        val six_rec = in_to_rec_fn.io.out
-
-        // Get 2^shift as a float
-        val shift_exp = self.bias.U(self.expWidth.W) + shift
-        val shift_fn = Cat(0.U(1.W), shift_exp, 0.U((self.sigWidth-1).W))
-        val shift_rec = recFNFromFN(self.expWidth, self.sigWidth, shift_fn)
-
-        // Get 6*(2^shift) as a float
-        val muladder = Module(new MulAddRecFN(self.expWidth, self.sigWidth))
-
-        muladder.io.op := 0.U
-        muladder.io.roundingMode := consts.round_near_even // consts.round_near_maxMag
-        muladder.io.detectTininess := consts.tininess_afterRounding
-
-        muladder.io.a := six_rec
-        muladder.io.b := shift_rec
-        muladder.io.c := 0.U
-
-        val shifted_rec = muladder.io.out
-
-        // Now, compare self and 6*(2^shift) to calculate the activation function
-        val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits)
-        val self_raw = rawFloatFromFN(self.expWidth, self.sigWidth, self.bits)
-
-        val comparer = Module(new CompareRecFN(self.expWidth, self.sigWidth))
-        comparer.io.a := self_rec
-        comparer.io.b := shifted_rec
-        comparer.io.signaling := false.B
-
-        val larger_than_six = comparer.io.gt
-
-        val result_rec = Mux(!self_raw.isZero && self_raw.sign, 0.U,
-          Mux(larger_than_six, shifted_rec, self_rec))
-
-        val result = Wire(Float(self.expWidth, self.sigWidth))
-        result.bits := fNFromRecFN(self.expWidth, self.sigWidth, result_rec)
-        result
-      }
-
       override def zero: Float = 0.U.asTypeOf(self)
       override def identity: Float = Cat(0.U(2.W), ~(0.U((self.expWidth-1).W)), 0.U((self.sigWidth-1).W)).asTypeOf(self)
+      override def minimum: Float = Cat(1.U, ~(0.U(self.expWidth.W)), 0.U((self.sigWidth-1).W)).asTypeOf(self) // top bit 1, exponent all ones, fraction zero (presumably -infinity in IEEE-754 -- confirm)
     }
   }
 
@@ -379,14 +532,15 @@ object Arithmetic {
       override def *(t: DummySInt) = self.dontCare
       override def mac(m1: DummySInt, m2: DummySInt) = self.dontCare
       override def +(t: DummySInt) = self.dontCare
+      override def -(t: DummySInt) = self.dontCare
       override def >>(t: UInt) = self.dontCare
       override def >(t: DummySInt): Bool = false.B
       override def identity = self.dontCare
       override def withWidthOf(t: DummySInt) = self.dontCare
       override def clippedToWidthOf(t: DummySInt) = self.dontCare
       override def relu = self.dontCare
-      override def relu6(shift: UInt) = self.dontCare
       override def zero = self.dontCare
+      override def minimum: DummySInt = self.dontCare
     }
   }
 }
diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala
index 7464dc61..2a060ea9 100644
--- a/src/main/scala/gemmini/Configs.scala
+++ b/src/main/scala/gemmini/Configs.scala
@@ -146,7 +146,7 @@ object GemminiConfigs {
 
         Mux(overflow, sat, rec_fn_to_in.io.out.asTypeOf(t))
       },
-      1, Float(8, 24), -1,
+      8, Float(8, 24), -1,
       identity = "1.0",
       c_str = "({float y = ROUND_NEAR_EVEN((x) * (scale)); y > INT8_MAX ? INT8_MAX : (y < INT8_MIN ? INT8_MIN : (acc_t)y);})"
     )),
diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
index cf572c0c..2c15d3ea 100644
--- a/src/main/scala/gemmini/Controller.scala
+++ b/src/main/scala/gemmini/Controller.scala
@@ -399,7 +399,6 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
         unrolled_cmd.ready := true.B
       }
     }
-
   }
 
   // Debugging signals
diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala
index e1ed7199..9172e8ee 100644
--- a/src/main/scala/gemmini/CustomConfigs.scala
+++ b/src/main/scala/gemmini/CustomConfigs.scala
@@ -41,6 +41,14 @@ object GemminiCustomConfigs {
     acc_capacity = CapacityInKilobytes(128),
   )
 
+  val bertInferenceConfig = defaultConfig.copy(
+    has_training_convs = false,
+    has_max_pool = false,
+    has_normalizations = true, // off by default in GemminiArrayConfig; turned on for this config
+
+    acc_capacity = CapacityInKilobytes(128),
+  )
+
   // Specify which of your custom configs you want to build here
   val customConfig = baselineInferenceConfig
 }
diff --git a/src/main/scala/gemmini/DMACommandTracker.scala b/src/main/scala/gemmini/DMACommandTracker.scala
index 3390cbdf..9d4f71e6 100644
--- a/src/main/scala/gemmini/DMACommandTracker.scala
+++ b/src/main/scala/gemmini/DMACommandTracker.scala
@@ -20,7 +20,6 @@ class DMACommandTracker[T <: Data](val nCmds: Int, val maxBytes: Int, tag_t: =>
         val tag = Input(tag_t.cloneType)
         val bytes_to_read = Input(UInt(log2Up(maxBytes+1).W))
         val cmd_id = Output(cmd_id_t.cloneType)
-
       }
 
       val bits = new BitsT(tag_t.cloneType, cmd_id_t.cloneType)
diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala
index 62fc4495..514d918e 100644
--- a/src/main/scala/gemmini/ExecuteController.scala
+++ b/src/main/scala/gemmini/ExecuteController.scala
@@ -29,7 +29,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
 
     val acc = new Bundle {
       val read_req = Vec(acc_banks, Decoupled(new AccumulatorReadReq(
-          acc_bank_entries, log2Up(accType.getWidth), acc_scale_t
+          acc_bank_entries, accType, acc_scale_t
       )))
 
       val read_resp = Flipped(Vec(acc_banks, Decoupled(new AccumulatorScaleResp(
@@ -115,8 +115,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
 
   val in_shift = Reg(UInt(log2Up(accType.getWidth).W))
   val acc_scale = Reg(acc_scale_t)
-  val relu6_shift = Reg(UInt(log2Up(accType.getWidth).W))
-  val activation = if (has_nonlinear_activations) Reg(UInt(2.W)) else Activation.NONE // TODO magic number
+  val activation = if (has_nonlinear_activations) Reg(UInt(Activation.bitwidth.W)) else Activation.NONE // a register only when nonlinear activations are enabled; otherwise the constant NONE
   val a_transpose = Reg(Bool())
   val bd_transpose = Reg(Bool())
   val config_initialized = RegInit(false.B)
@@ -470,7 +469,10 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
       io.acc.read_req(i).valid := read_a_from_acc || read_b_from_acc || read_d_from_acc
       io.acc.read_req(i).bits.scale := acc_scale
       io.acc.read_req(i).bits.full := false.B
-      io.acc.read_req(i).bits.relu6_shift := relu6_shift
+      io.acc.read_req(i).bits.igelu_qb := DontCare
+      io.acc.read_req(i).bits.igelu_qc := DontCare
+      io.acc.read_req(i).bits.iexp_qln2 := DontCare
+      io.acc.read_req(i).bits.iexp_qln2_inv := DontCare
       io.acc.read_req(i).bits.act := activation
       io.acc.read_req(i).bits.fromDMA := false.B
       io.acc.read_req(i).bits.addr := MuxCase(a_address_rs1.acc_row() + a_fire_counter,
@@ -487,7 +489,10 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
       io.acc.read_req(i).valid := false.B
       io.acc.read_req(i).bits.scale := DontCare
       io.acc.read_req(i).bits.full := false.B
-      io.acc.read_req(i).bits.relu6_shift := relu6_shift
+      io.acc.read_req(i).bits.igelu_qb := DontCare
+      io.acc.read_req(i).bits.igelu_qc := DontCare
+      io.acc.read_req(i).bits.iexp_qln2 := DontCare
+      io.acc.read_req(i).bits.iexp_qln2_inv := DontCare
       io.acc.read_req(i).bits.act := DontCare
       io.acc.read_req(i).bits.fromDMA := false.B
       io.acc.read_req(i).bits.addr := DontCare
@@ -550,7 +555,6 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
               }
               in_shift := config_ex_rs2.in_shift
               acc_scale := rs1s(0)(xLen - 1, 32).asTypeOf(acc_scale_t) // TODO magic number
-              relu6_shift := config_ex_rs2.relu6_shift
               a_transpose := config_ex_rs1.a_transpose
               bd_transpose := config_ex_rs1.b_transpose
 
@@ -614,7 +618,6 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
 
           start_inputting_a := !a_should_be_fed_into_transposer
           start_inputting_b := !b_should_be_fed_into_transposer
-          start_inputting_b := true.B
 
           control_state := compute
         }
@@ -924,8 +927,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
     val activated_wdata = VecInit(mesh.io.resp.bits.data.map(v => VecInit(v.map { e =>
       val e_clipped = e.clippedToWidthOf(inputType)
       val e_act = MuxCase(e_clipped, Seq(
-        (activation === Activation.RELU) -> e_clipped.relu,
-        (activation === Activation.RELU6) -> e_clipped.relu6(relu6_shift)))
+        (activation === Activation.RELU) -> e_clipped.relu))
 
       e_act
     })))
diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala
index 40c1c777..573581ec 100644
--- a/src/main/scala/gemmini/GemminiConfigs.scala
+++ b/src/main/scala/gemmini/GemminiConfigs.scala
@@ -86,7 +86,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
                                                                              has_max_pool: Boolean = true,
                                                                              has_nonlinear_activations: Boolean = true,
                                                                              has_dw_convs: Boolean = true,
-
+                                                                             has_normalizations: Boolean = false,
                                                                              has_first_layer_optimizations: Boolean = true,
 
                                                                              use_firesim_simulation_counters: Boolean = false,
@@ -492,6 +492,11 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
       header ++= "#define HAS_FIRST_LAYER_OPTIMIZATIONS\n\n"
     }
 
+    if (has_normalizations) {
+      header ++= "#define HAS_NORMALIZATIONS\n"
+      header ++= "#define NORM_STAT_IDS 4\n\n"
+    }
+
     header ++= s"#endif // $guard\n"
     header.toString()
   }
diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala
index ea3aed12..7bca089b 100644
--- a/src/main/scala/gemmini/GemminiISA.scala
+++ b/src/main/scala/gemmini/GemminiISA.scala
@@ -38,7 +38,7 @@ object GemminiISA {
   val CONFIG_EX = 0.U
   val CONFIG_LOAD = 1.U
   val CONFIG_STORE = 2.U
-  val CONFIG_IM2COL = 3.U
+  val CONFIG_NORM = 3.U
 
   //==========================================================================
   // cisc-gemmini opcodes
@@ -107,7 +107,7 @@ object GemminiISA {
     val _unused = UInt(CONFIG_MVIN_RS1_UNUSED_WIDTH.W)
   }
 
-  val CONFIG_MVOUT_RS1_UNUSED_WIDTH = 2
+  val CONFIG_MVOUT_RS1_CMD_TYPE_WIDTH = 2
   val CONFIG_MVOUT_RS1_ACTIVATION_WIDTH = 2
   val CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH = 2
   val CONFIG_MVOUT_RS1_MAX_POOLING_WINDOW_SIZE_WIDTH = 2
@@ -132,7 +132,7 @@ object GemminiISA {
     val pool_size = UInt(CONFIG_MVOUT_RS1_MAX_POOLING_WINDOW_SIZE_WIDTH.W)
     val pool_stride = UInt(CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH.W)
     val activation = UInt(CONFIG_MVOUT_RS1_ACTIVATION_WIDTH.W)
-    val _unused = UInt(CONFIG_MVOUT_RS1_UNUSED_WIDTH.W)
+    val cmd_type = UInt(CONFIG_MVOUT_RS1_CMD_TYPE_WIDTH.W)
   }
 
   val CONFIG_MVOUT_RS2_ACC_SCALE_WIDTH = 32
@@ -145,6 +145,36 @@ object GemminiISA {
     val stride = UInt(stride_bits.W)
   }
 
+  val CONFIG_NORM_RS1_Q_CONST_WIDTH = 32
+  val CONFIG_NORM_RS1_SPACER1_WIDTH = 13
+  val CONFIG_NORM_RS1_Q_CONST_TYPE_WIDTH = 1
+  val CONFIG_NORM_RS1_SET_STATS_ID_ONLY_WIDTH = 1
+  val CONFIG_NORM_RS1_ACT_MSB_WIDTH = 1
+  val CONFIG_NORM_RS1_NORM_STATS_ID_WIDTH = 8
+  val CONFIG_NORM_RS1_SPACER0_WIDTH = 6
+  val CONFIG_NORM_RS1_CMD_TYPE_WIDTH = 2
+
+  class ConfigNormRs1(acc_t_bits: Int = 32) extends Bundle {
+    val q_const = UInt(acc_t_bits.W)
+    val _spacer1 = UInt(CONFIG_NORM_RS1_SPACER1_WIDTH.W)
+    val q_const_type = UInt(CONFIG_NORM_RS1_Q_CONST_TYPE_WIDTH.W)
+    val set_stats_id_only = UInt(CONFIG_NORM_RS1_SET_STATS_ID_ONLY_WIDTH.W)
+    val act_msb = UInt(CONFIG_NORM_RS1_ACT_MSB_WIDTH.W)
+    val norm_stats_id = UInt(CONFIG_NORM_RS1_NORM_STATS_ID_WIDTH.W)
+    val _spacer0 = UInt(CONFIG_NORM_RS1_SPACER0_WIDTH.W)
+    val cmd_type = UInt(CONFIG_NORM_RS1_CMD_TYPE_WIDTH.W)
+  }
+
+  val CONFIG_NORM_RS2_QC_WIDTH = 32
+  val CONFIG_NORM_RS2_QB_WIDTH = 32
+
+  class ConfigNormRs2(acc_t_bits: Int) extends Bundle {
+    val _spacer1 = UInt((CONFIG_NORM_RS2_QC_WIDTH - acc_t_bits).W)
+    val qc = UInt(acc_t_bits.W)
+    val _spacer0 = UInt((CONFIG_NORM_RS2_QB_WIDTH - acc_t_bits).W)
+    val qb = UInt(acc_t_bits.W)
+  }
+
   val CONFIG_EX_RS1_CMD_TYPE_WIDTH = 2
   val CONFIG_EX_RS1_DATAFLOW_WIDTH = 1
   val CONFIG_EX_RS1_ACTIVATION_WIDTH = 2
diff --git a/src/main/scala/gemmini/LocalAddr.scala b/src/main/scala/gemmini/LocalAddr.scala
index 92e46ffc..b53addea 100644
--- a/src/main/scala/gemmini/LocalAddr.scala
+++ b/src/main/scala/gemmini/LocalAddr.scala
@@ -21,8 +21,13 @@ class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_en
   val is_acc_addr = Bool()
   val accumulate = Bool()
   val read_full_acc_row = Bool()
-  val garbage = UInt(((localAddrBits - maxAddrBits - 4) max 0).W)
-  val garbage_bit = if (localAddrBits - maxAddrBits >= 4) UInt(1.W) else UInt(0.W)
+  val norm_cmd = NormCmd()
+
+  private val metadata_w = is_acc_addr.getWidth + accumulate.getWidth + read_full_acc_row.getWidth + norm_cmd.getWidth
+  assert(maxAddrBits + metadata_w < 32)
+
+  val garbage = UInt(((localAddrBits - maxAddrBits - metadata_w - 1) max 0).W)
+  val garbage_bit = if (localAddrBits - maxAddrBits >= metadata_w + 1) UInt(1.W) else UInt(0.W)
   val data = UInt(maxAddrBits.W)
 
   def sp_bank(dummy: Int = 0) = if (spAddrBits == spBankRowBits) 0.U else data(spAddrBits - 1, spBankRowBits)
diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index 3d7a099e..3fe28e98 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -922,7 +922,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
   pre_pool_config_cmd_rs1.pool_size := pool_size
   pre_pool_config_cmd_rs1.pool_stride := pool_stride
   pre_pool_config_cmd_rs1.activation := req.activation
-  pre_pool_config_cmd_rs1._unused := CONFIG_STORE
+  pre_pool_config_cmd_rs1.cmd_type := CONFIG_STORE
   pre_pool_config_cmd.rs1 := pre_pool_config_cmd_rs1.asUInt()
 
   val pre_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType)
@@ -938,7 +938,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
   val post_pool_config_cmd_rs1 = Wire(new ConfigMvoutRs1)
   post_pool_config_cmd_rs1 := DontCare
   post_pool_config_cmd_rs1.activation := req.activation
-  post_pool_config_cmd_rs1._unused := CONFIG_STORE
+  post_pool_config_cmd_rs1.cmd_type := CONFIG_STORE
   post_pool_config_cmd.rs1 := post_pool_config_cmd_rs1.asUInt()
 
   val post_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType)
diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala
index 52871276..5f564000 100644
--- a/src/main/scala/gemmini/LoopMatmul.scala
+++ b/src/main/scala/gemmini/LoopMatmul.scala
@@ -488,6 +488,7 @@ class LoopMatmulStCReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat
   val dram_addr = UInt(coreMaxAddrBits.W)
   val dram_stride = UInt(coreMaxAddrBits.W)
   val full_c = Bool()
+  val act = UInt(Activation.bitwidth.W)
   val addr_start = UInt(log2Up(max_acc_addr).W)
   val loop_id = UInt(log2Up(concurrent_loops).W)
 }
@@ -513,7 +514,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   })
 
   object State extends ChiselEnum {
-    val idle, st = Value
+    val idle, st, ln_config, ln_st = Value
   }
   import State._
   val state = RegInit(idle)
@@ -522,6 +523,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
 
   val max_blocks = Mux(req.full_c, 1.U, Mux(req.max_j <= max_block_len.U, req.max_j, max_block_len.U))
 
+  // Non-normalization-related iterators and calculations
   val j = Reg(UInt(iterator_bitwidth.W))
   val i = Reg(UInt(iterator_bitwidth.W))
 
@@ -547,26 +549,80 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   mvout_cmd_rs2.local_addr := cast_to_acc_addr(mvout_cmd_rs2.local_addr, sp_addr, accumulate = false.B, read_full = req.full_c)
   mvout_cmd.rs2 := mvout_cmd_rs2.asUInt()
 
+  // Layernorm iterators and calculations
+  val ln_row = Reg(UInt(iterator_bitwidth.W))
+  val ln_cmd = Reg(UInt(iterator_bitwidth.W))
+  val ln_stat_id = Reg(UInt(iterator_bitwidth.W))
+
+  val NORM_STAT_IDS = 4 // TODO magic number
+
+  val ln_norm_cmds = VecInit(VecInit(NormCmd.SUM, NormCmd.MEAN), VecInit(NormCmd.VARIANCE, NormCmd.INV_STDDEV),
+    VecInit(NormCmd.RESET, NormCmd.RESET))
+
+  val sm_norm_cmds = VecInit(VecInit(NormCmd.MAX, NormCmd.MAX), VecInit(NormCmd.SUM_EXP, NormCmd.INV_SUM_EXP),
+    VecInit(NormCmd.RESET, NormCmd.RESET))
+
+  val ln_stat_ids = Mux(rows -& ln_row > NORM_STAT_IDS.U, NORM_STAT_IDS.U, rows -& ln_row)
+
+  val ln_r = ln_row +& ln_stat_id
+
+  val ln_sp_addr = acc_addr_start +& (i * req.max_j +& j) * block_size.U +& ln_r
+  val ln_norm_cmd = Mux(j +& max_blocks >= req.max_j,
+    Mux(req.act === Activation.LAYERNORM, ln_norm_cmds(ln_cmd)(1), sm_norm_cmds(ln_cmd)(1)),
+    Mux(req.act === Activation.LAYERNORM, ln_norm_cmds(ln_cmd)(0), sm_norm_cmds(ln_cmd)(0)))
+
+  // TODO we assume for now that full_C and layernorm aren't true at the same time
+  val ln_dram_offset = ((i * req.dram_stride +& j) * block_size.U +& ln_r * req.dram_stride) * (input_w/8).U
+  val ln_dram_addr = req.dram_addr + LoopMatmul.castDramOffset(ln_dram_offset)
+
+  val ln_config_norm_rs1 = Wire(new GemminiISA.ConfigNormRs1)
+  ln_config_norm_rs1 := DontCare
+  ln_config_norm_rs1.set_stats_id_only := 1.U
+  ln_config_norm_rs1.cmd_type := CONFIG_NORM
+  ln_config_norm_rs1.norm_stats_id := ln_stat_id
+
+  val ln_config_norm = Wire(new RoCCCommand)
+  ln_config_norm := DontCare
+  ln_config_norm.inst.funct := CONFIG_CMD
+  ln_config_norm.rs1 := ln_config_norm_rs1.asUInt()
+  ln_config_norm.rs2 := DontCare
+
+  val ln_mvout_cmd = Wire(new RoCCCommand)
+  ln_mvout_cmd := DontCare
+  ln_mvout_cmd.inst.funct := STORE_CMD
+  ln_mvout_cmd.rs1 := ln_dram_addr
+
+  val ln_mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType)
+  ln_mvout_cmd_rs2 := DontCare
+  ln_mvout_cmd_rs2.num_rows := 1.U
+  ln_mvout_cmd_rs2.num_cols := cols.asUInt()
+  ln_mvout_cmd_rs2.local_addr := cast_to_acc_addr(ln_mvout_cmd_rs2.local_addr, ln_sp_addr, accumulate = false.B, read_full = req.full_c)
+  ln_mvout_cmd_rs2.local_addr.norm_cmd := ln_norm_cmd
+  ln_mvout_cmd.rs2 := ln_mvout_cmd_rs2.asUInt()
+
   io.req.ready := state === idle
   io.j := j
   io.i := i
   io.idle := state === idle
 
-  // The order here is k, j, i
-  // val ex_ahead = io.ex_completed || (io.ex_k === req.max_k - 1.U && (io.ex_j > j || (io.ex_j === j && io.ex_i > i)))
+  // The order here is k, j, i when not doing LAYERNORM or SOFTMAX
   val ex_ahead = io.ex_completed ||
-    (io.ex_k === req.max_k - 1.U &&
-      (io.ex_j >= j + blocks ||
-        ((io.ex_j === j + blocks - 1.U) && io.ex_i > i)))
+    ((req.act =/= Activation.LAYERNORM) && (req.act =/= Activation.SOFTMAX) &&
+      (io.ex_k === req.max_k - 1.U &&
+        (io.ex_j >= j + blocks ||
+          ((io.ex_j === j + blocks - 1.U) && io.ex_i > i))))
 
   io.cmd.valid := state =/= idle && !io.rob_overloaded && ex_ahead && req.dram_addr =/= 0.U
-  io.cmd.bits := mvout_cmd
+  io.cmd.bits := MuxCase(mvout_cmd, Seq(
+    (state === ln_config) -> ln_config_norm,
+    (state === ln_st) -> ln_mvout_cmd,
+  ))
 
   io.loop_id := req.loop_id
 
   when (req.dram_addr === 0.U) {
     state := idle
-  }.elsewhen (io.cmd.fire) {
+  }.elsewhen (io.cmd.fire() && state === st) {
     // The order here is k, j, i
     val next_i = floorAdd(i, 1.U, req.max_i)
     val next_j = floorAdd(j, max_blocks, req.max_j, next_i === 0.U)
@@ -577,13 +633,38 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
     when (next_i === 0.U && next_j === 0.U) {
       state := idle
     }
+  }.elsewhen (io.cmd.fire() && state === ln_config) {
+    state := ln_st
+  }.elsewhen (io.cmd.fire() && state === ln_st) {
+    val next_j = floorAdd(j, max_blocks, req.max_j)
+    val next_stat_id = floorAdd(ln_stat_id, 1.U, ln_stat_ids, next_j === 0.U)
+    val next_cmd = floorAdd(ln_cmd, 1.U, ln_norm_cmds.size.U, next_j === 0.U && next_stat_id === 0.U)
+    val next_row = floorAdd(ln_row, NORM_STAT_IDS.U, rows, next_j === 0.U && next_stat_id === 0.U && next_cmd === 0.U)
+    val next_i = floorAdd(i, 1.U, req.max_i,
+      next_j === 0.U && next_stat_id === 0.U && next_cmd === 0.U && next_row === 0.U)
+
+    j := next_j
+    ln_stat_id := next_stat_id
+    ln_cmd := next_cmd
+    ln_row := next_row
+    i := next_i
+
+    when (next_i === 0.U && next_row === 0.U && next_cmd === 0.U && next_stat_id === 0.U && next_j === 0.U) {
+      state := idle
+    }.elsewhen (next_j === 0.U) {
+      state := ln_config
+    }
   }
 
   when (io.req.fire) {
     req := io.req.bits
-    state := st
+    state := Mux((io.req.bits.act === Activation.LAYERNORM) || (io.req.bits.act === Activation.SOFTMAX), ln_config, st)
+
     j := 0.U
     i := 0.U
+    ln_row := 0.U
+    ln_cmd := 0.U
+    ln_stat_id := 0.U
   }
 }
 
@@ -610,12 +691,12 @@ class LoopMatmulState(val iterator_bitwidth: Int, val coreMaxAddrBits: Int, val
   val a_transpose = Bool()
   val b_transpose = Bool()
 
+  val act = UInt(Activation.bitwidth.W)
+
   val low_d = Bool()
   val full_c = Bool()
   val ex_accumulate = Bool()
 
-  val weightA = UInt(8.W) // TODO magic numbers
-
   val configured = Bool()
 
   val running = Bool()
@@ -706,7 +787,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size
   val ab_loads_on_same_loop = ldA.io.loop_id === ldB.io.loop_id
   ldab_arb.io.forceA := !ab_loads_on_same_loop && ldA.io.loop_id === head_loop_id
   ldab_arb.io.forceB := !ab_loads_on_same_loop && ldB.io.loop_id === head_loop_id
-  ldab_arb.io.weightA := head_loop.weightA
+  ldab_arb.io.weightA := 0.U
   ldab_arb.io.inA_idle := ldA.io.idle
   ldab_arb.io.inB_idle := ldB.io.idle
   ldab_arb.io.inA_k := ldA.io.k
@@ -812,11 +893,11 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size
         loop_being_configured.ex_accumulate := cmd.bits.cmd.rs1(0)
         loop_being_configured.full_c := cmd.bits.cmd.rs1(1)
         loop_being_configured.low_d := cmd.bits.cmd.rs1(2)
+        loop_being_configured.act := cmd.bits.cmd.rs1(8+Activation.bitwidth-1, 8) // TODO magic numbers
+
         loop_being_configured.a_transpose := cmd.bits.cmd.rs2(0)
         loop_being_configured.b_transpose := cmd.bits.cmd.rs2(1)
 
-        loop_being_configured.weightA := cmd.bits.cmd.rs1(15, 8) // TODO magic numbers
-
         loop_being_configured.configured := true.B
 
         loops_configured := loops_configured + 1.U
@@ -928,6 +1009,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size
   stC.io.req.bits.dram_addr := loop_requesting_st.c_dram_addr
   stC.io.req.bits.dram_stride := loop_requesting_st.c_dram_stride
   stC.io.req.bits.full_c := loop_requesting_st.full_c
+  stC.io.req.bits.act := loop_requesting_st.act
   stC.io.req.bits.addr_start := st_c_addr_start
   stC.io.req.bits.loop_id := loop_requesting_st_id
 
diff --git a/src/main/scala/gemmini/NormCmd.scala b/src/main/scala/gemmini/NormCmd.scala
new file mode 100644
index 00000000..515fabb0
--- /dev/null
+++ b/src/main/scala/gemmini/NormCmd.scala
@@ -0,0 +1,23 @@
+
+package gemmini
+
+import chisel3._
+import chisel3.util._
+import chisel3.experimental.ChiselEnum
+
+object NormCmd extends ChiselEnum { // Normalization commands; carried by NormalizedInput.cmd
+  val RESET, SUM, MEAN, VARIANCE, INV_STDDEV, MAX, SUM_EXP, INV_SUM_EXP = Value
+
+  def writes_to_main_memory(cmd: Type): Bool = { // true only for RESET
+    cmd === RESET
+  }
+
+  def non_reset_version(cmd: Type): Type = { // MEAN -> SUM, INV_STDDEV -> VARIANCE, INV_SUM_EXP -> SUM_EXP; all else unchanged
+    MuxCase(cmd, Seq(
+      (cmd === MEAN) -> SUM,
+      (cmd === MAX) -> MAX, // identity mapping; same result as the MuxCase default
+      (cmd === INV_STDDEV) -> VARIANCE,
+      (cmd === INV_SUM_EXP) -> SUM_EXP
+    ))
+  }
+}
diff --git a/src/main/scala/gemmini/Normalizer.scala b/src/main/scala/gemmini/Normalizer.scala
new file mode 100644
index 00000000..89dca2db
--- /dev/null
+++ b/src/main/scala/gemmini/Normalizer.scala
@@ -0,0 +1,635 @@
+
+package gemmini
+
+import chisel3._
+import chisel3.experimental.ChiselEnum
+import chisel3.util._
+import gemmini.AccumulatorScale.iexp
+import hardfloat.{DivSqrtRecFN_small, INToRecFN, consts, fNFromRecFN}
+
+class NormalizedInput[T <: Data: Arithmetic, U <: Data](max_len: Int, num_stats: Int, fullDataType: Vec[Vec[T]],
+                                                        scale_t: U) extends Bundle { // One request to the Normalizer
+  val acc_read_resp = new AccumulatorReadResp[T,U](fullDataType, scale_t) // accumulator read response carried with the request
+  val len = UInt(log2Up(max_len + 1).W) // element count; wide enough to hold 0..max_len
+  val stats_id = UInt(log2Up(num_stats).W) // selects which of the num_stats entries in Normalizer.stats is used
+  val cmd = NormCmd() // operation to perform on this request
+}
+
+class NormalizedOutput[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle { // Normalizer result
+  val acc_read_resp = new AccumulatorReadResp[T,U](fullDataType, scale_t) // read response forwarded from the stored request
+  val mean = fullDataType.head.head.cloneType // same type as one accumulator element
+  val max = fullDataType.head.head.cloneType // same type as one accumulator element
+  val inv_stddev = scale_t.cloneType // Normalizer drives this from its stored inv_stddev via asTypeOf(scale_t)
+  val inv_sum_exp = scale_t.cloneType // Normalizer drives this from its stored inv_sum_exp via asTypeOf(scale_t)
+}
+
+class IExpConst[T <: Data](acc_t: T) extends Bundle { // Constants handed to iexp(); every field has the type of acc_t
+  val qb = acc_t.cloneType // fourth argument of iexp in AccumulationLanes
+  val qc = acc_t.cloneType // fifth argument of iexp in AccumulationLanes
+  val qln2 = acc_t.cloneType // second argument of iexp in AccumulationLanes
+  val qln2_inv = acc_t.cloneType // third argument of iexp in AccumulationLanes
+}
+
+class AccumulationLanes[T <: Data](num_stats: Int, acc_t: T, n_lanes: Int, latency: Int)(implicit ev: Arithmetic[T])
+  extends Module {
+  // Sums up to n_lanes per-element terms: the element itself, its squared error from `mean`, or iexp(element - max)
+
+  import ev._
+
+  class LaneOutput extends Bundle {
+    val result = acc_t.cloneType // reduced sum of one input group
+    val stats_id = UInt(log2Up(num_stats).W) // copied from the input so the consumer knows which stats entry to update
+  }
+
+  val io = IO(new Bundle {
+    val ins = Flipped(Valid(new Bundle {
+      val len = UInt(log2Up(n_lanes+1).W) // number of leading lanes that are valid (0..n_lanes)
+      val data = Vec(n_lanes, acc_t)
+      val mean = acc_t.cloneType // subtracted from each element for VARIANCE / INV_STDDEV
+      val max = acc_t.cloneType // subtracted from each element before iexp
+      val iexp_const = new IExpConst(acc_t)
+      val cmd = NormCmd()
+      val stats_id = UInt(log2Up(num_stats).W)
+    }))
+
+    val out = Valid(new LaneOutput)
+
+    val busy = Output(Bool())
+  })
+
+  val cmd = io.ins.bits.cmd
+  val mean = io.ins.bits.mean
+  val iexp_c = io.ins.bits.iexp_const
+
+  val data = io.ins.bits.data.zipWithIndex.map { case (d, i) => // per-lane term selected by cmd
+    val iexp_result = iexp(d - io.ins.bits.max, iexp_c.qln2, iexp_c.qln2_inv, iexp_c.qb, iexp_c.qc)
+    Mux(i.U < io.ins.bits.len,
+      MuxCase(d, Seq( // default (any other cmd): the raw element
+        (cmd === NormCmd.VARIANCE || cmd === NormCmd.INV_STDDEV) -> (d-mean)*(d-mean),
+        (cmd === NormCmd.SUM_EXP || cmd === NormCmd.INV_SUM_EXP) ->
+          iexp_result //iexp(d - io.ins.bits.max, iexp_c.qln2, iexp_c.qln2_inv, iexp_c.qb, iexp_c.qc)
+      )).withWidthOf(acc_t),
+      d.zero) // lanes at index >= len contribute zero
+  }
+
+  val result = data.reduce(_ + _)
+
+  val pipe = Module(new Pipeline[LaneOutput](new LaneOutput, latency)()) // result and stats_id travel through this pipeline together
+
+  pipe.io.in.valid := io.ins.valid
+  // io.ins.ready := pipe.io.in.ready
+  pipe.io.in.bits.result := result
+  pipe.io.in.bits.stats_id := io.ins.bits.stats_id
+
+  io.out.valid := pipe.io.out.valid
+  pipe.io.out.ready := true.B // the pipeline output is never back-pressured
+  // pipe.io.out.ready := io.out.ready
+  io.out.bits := pipe.io.out.bits
+
+  io.busy := pipe.io.busy
+}
+
+class MaxLanes[T <: Data](num_stats: Int, acc_t: T, n_lanes: Int, latency: Int)(implicit ev: Arithmetic[T])
+  extends Module {
+  // Reduces up to n_lanes input elements to their maximum and sends the result through a pipeline
+
+  import ev._
+  import NormCmd._ // NOTE(review): no NormCmd member appears to be referenced in this module
+
+  class LaneOutput extends Bundle {
+    val result = acc_t.cloneType // maximum of one input group
+    val stats_id = UInt(log2Up(num_stats).W) // copied from the input so the consumer knows which stats entry to update
+  }
+
+  val io = IO(new Bundle {
+    val ins = Flipped(Valid(new Bundle {
+      val len = UInt(log2Up(n_lanes + 1).W) // number of leading lanes that are valid (0..n_lanes)
+      val data = Vec(n_lanes, acc_t)
+      val stats_id = UInt(log2Up(num_stats).W)
+    }))
+
+    val out = Valid(new LaneOutput)
+
+    val busy = Output(Bool())
+  })
+
+  val data = io.ins.bits.data.zipWithIndex.map { case (d, i) =>
+    Mux(i.U < io.ins.bits.len, d.withWidthOf(acc_t), d.minimum) // lanes at index >= len are replaced by d.minimum
+  }
+
+  val result = data.reduce({ (max, x) => Mux(x > max, x, max) }) // keeps the larger operand at every step
+
+  val pipe = Module(new Pipeline[LaneOutput](new LaneOutput, latency)()) // result and stats_id travel through this pipeline together
+
+  pipe.io.in.valid := io.ins.valid
+  // io.ins.ready := pipe.io.in.ready
+  pipe.io.in.bits.result := result
+  pipe.io.in.bits.stats_id := io.ins.bits.stats_id
+
+  io.out.valid := pipe.io.out.valid
+  pipe.io.out.ready := true.B // the pipeline output is never back-pressured
+  // pipe.io.out.ready := io.out.ready
+  io.out.bits := pipe.io.out.bits
+
+  io.busy := pipe.io.busy
+}
+
+class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_stats: Int, latency: Int,
+                                       fullDataType: Vec[Vec[T]], scale_t: U)
+                                      (implicit ev: Arithmetic[T]) extends Module { // Keeps num_stats Stats entries; a RESET input emits an entry's statistics on io.out
+  import ev._
+  val acc_t = fullDataType.head.head.cloneType
+  val vec_size = fullDataType.flatten.size
+  val n_lanes = if (num_reduce_lanes < 0) vec_size else num_reduce_lanes // a negative num_reduce_lanes selects one lane per element
+
+  assert(isPow2(n_lanes))
+
+  val io = IO(new Bundle {
+    val in = Flipped(Decoupled(new NormalizedInput[T,U](max_len, num_stats, fullDataType, scale_t)))
+    val out = Decoupled(new NormalizedOutput(fullDataType, scale_t))
+  })
+
+  object State extends ChiselEnum {
+    // NOTE: We assume that "idle" and "output" are the first two states. We also assume that all the enums on the same
+    //   line keep the order below
+    val idle, output = Value
+    val get_sum = Value
+    val get_mean, waiting_for_mean = Value
+    val get_variance, waiting_for_variance, get_stddev, waiting_for_stddev, get_inv_stddev, waiting_for_inv_stddev = Value
+    val get_max = Value
+    val get_inv_sum_exp, waiting_for_inv_sum_exp = Value
+  }
+  import State._
+
+  // Buffers for normalization stats
+  class Stats extends Bundle {
+    val req = new NormalizedInput[T,U](max_len, num_stats, fullDataType, scale_t) // last request accepted for this entry
+    val state = State()
+
+    // Running state
+    val sum = acc_t.cloneType
+    val count = UInt(16.W) // TODO magic number
+    val running_max = acc_t.cloneType // updated from max_lanes outputs
+    val max = acc_t.cloneType // loaded from running_max when a SUM_EXP request is accepted
+
+    // Iterative state
+    val mean = acc_t.cloneType
+    val inv_stddev = acc_t.cloneType // holds, in turn, the divider, sqrt and reciprocal outputs
+    val inv_sum_exp = acc_t.cloneType
+
+    val elems_left = req.len.cloneType // elements of req not yet issued to the lanes
+
+    def vec_grouped = VecInit(req.acc_read_resp.data.flatten.grouped(n_lanes).map(v => VecInit(v)).toSeq) // request data in groups of n_lanes
+    def vec_groups_left = elems_left / n_lanes.U + (elems_left % n_lanes.U =/= 0.U) // ceil(elems_left / n_lanes)
+
+    def cmd = req.cmd
+
+    def waiting_for_lanes_to_drain = // while this is true for any entry, io.in.ready is held low
+      (cmd === NormCmd.MEAN && (state === get_sum || state === get_mean)) ||
+        (cmd === NormCmd.INV_STDDEV && (state === get_sum || state === get_variance)) ||
+        (cmd === NormCmd.MAX && (state === get_max)) ||
+        (cmd === NormCmd.INV_SUM_EXP && (state === get_sum))
+  }
+
+  val stats = Reg(Vec(num_stats, new Stats))
+  val done_with_functional_units = Wire(Vec(num_stats, Bool()))
+  val next_states = Wire(Vec(num_stats, State()))
+
+  (stats.map(_.state) zip next_states).foreach { case (s, ns) => s := ns } // every state register is driven from next_states
+
+  // IO
+  val in_stats_id = io.in.bits.stats_id
+  io.in.ready := (stats(in_stats_id).state === idle || done_with_functional_units(in_stats_id)) &&
+    stats.map(!_.waiting_for_lanes_to_drain).reduce(_ && _)
+
+  val out_stats_id = MuxCase((num_stats-1).U, // first (lowest-index) entry in the output state; defaults to the last entry
+    stats.zipWithIndex.map { case (s,i) => (s.state === output) -> i.U }
+  )
+
+  io.out.valid := stats(out_stats_id).state === output
+  io.out.bits.acc_read_resp := stats(out_stats_id).req.acc_read_resp
+  io.out.bits.mean := stats(out_stats_id).mean
+  io.out.bits.max := stats(out_stats_id).max
+  io.out.bits.inv_stddev := stats(out_stats_id).inv_stddev.asTypeOf(scale_t)
+  io.out.bits.inv_sum_exp := stats(out_stats_id).inv_sum_exp.asTypeOf(scale_t)
+
+  // Lanes and functional units
+  val lanes = Module(new AccumulationLanes(num_stats, acc_t, n_lanes, latency))
+  val max_lanes = Module(new MaxLanes(num_stats, acc_t, n_lanes, latency)) // TODO: change latency?
+
+  {
+    // Lanes input
+    val in_lanes_stats_id = MuxCase((num_stats-1).U, // first entry in get_sum; defaults to the last entry
+      stats.zipWithIndex.map { case (s,i) => (s.state === get_sum) -> i.U }
+    )
+
+    val stat = stats(in_lanes_stats_id)
+
+    val len = Mux(stat.elems_left % n_lanes.U === 0.U, n_lanes.U, stat.elems_left % n_lanes.U) // size of the group issued this cycle
+
+    lanes.io.ins.valid := stat.state === get_sum && stat.vec_groups_left > 0.U
+    lanes.io.ins.bits.data := stat.vec_grouped(stat.vec_groups_left-1.U) // groups are issued from the highest index down
+    lanes.io.ins.bits.mean := stat.mean
+    lanes.io.ins.bits.max := stat.max
+
+    val iexp_const = Wire(new IExpConst(acc_t)) // NOTE(review): read from the live io.in.bits, not stat.req - confirm they are stable here
+    iexp_const.qln2 := io.in.bits.acc_read_resp.iexp_qln2.asTypeOf(iexp_const.qln2)
+    iexp_const.qln2_inv := io.in.bits.acc_read_resp.iexp_qln2_inv.asTypeOf(iexp_const.qln2_inv)
+    iexp_const.qb := io.in.bits.acc_read_resp.igelu_qb.asTypeOf(iexp_const.qb)
+    iexp_const.qc := io.in.bits.acc_read_resp.igelu_qc.asTypeOf(iexp_const.qc)
+
+    lanes.io.ins.bits.cmd := stat.cmd
+    lanes.io.ins.bits.len := len
+    lanes.io.ins.bits.stats_id := in_lanes_stats_id
+    lanes.io.ins.bits.iexp_const := iexp_const
+
+    when (lanes.io.ins.fire()) {
+      stat.elems_left := stat.elems_left - len
+    }
+  }
+
+  {
+    // Lanes output
+    val out_lanes_stats_id = lanes.io.out.bits.stats_id
+
+    val stat = stats(out_lanes_stats_id)
+
+    when (lanes.io.out.fire()) {
+      stat.sum := stat.sum + lanes.io.out.bits.result // accumulate each group's partial sum
+    }
+  }
+
+  {
+    // Max lanes input
+    val max_in_lanes_stats_id = MuxCase((num_stats-1).U, // first entry in get_max; defaults to the last entry
+      stats.zipWithIndex.map { case (s,i) => (s.state === get_max) -> i.U }
+    )
+
+    val stat = stats(max_in_lanes_stats_id)
+
+    val len = Mux(stat.elems_left % n_lanes.U === 0.U, n_lanes.U, stat.elems_left % n_lanes.U) // size of the group issued this cycle
+
+    max_lanes.io.ins.valid := stat.state === get_max && stat.vec_groups_left > 0.U
+    max_lanes.io.ins.bits.data := stat.vec_grouped(stat.vec_groups_left-1.U) // groups are issued from the highest index down
+    max_lanes.io.ins.bits.len := len
+    max_lanes.io.ins.bits.stats_id := max_in_lanes_stats_id
+
+    when (max_lanes.io.ins.fire()) {
+      stat.elems_left := stat.elems_left - len
+    }
+  }
+
+  {
+    // Max lanes output
+    val max_out_lanes_stats_id = max_lanes.io.out.bits.stats_id
+
+    val stat = stats(max_out_lanes_stats_id)
+
+    when (max_lanes.io.out.fire()) {
+      stat.running_max := Mux(max_lanes.io.out.bits.result > stat.running_max, max_lanes.io.out.bits.result, stat.running_max)
+      //stat.max := Mux(max_lanes.io.out.bits.result > stat.max, max_lanes.io.out.bits.result, stat.max)
+    }
+  }
+
+  val sum_to_divide_id = MuxCase((num_stats-1).U,
+    stats.zipWithIndex.map { case (s,i) =>
+      (s.state === get_mean || s.state === get_variance) -> i.U }
+  )
+  val sum_to_divide = stats(sum_to_divide_id).sum
+  val (divider_in, divider_out) = sum_to_divide.divider(stats.head.count).get // NOTE(review): .get presumably throws at elaboration if no divider exists
+
+  {
+    // Divider input
+    val stat = stats(sum_to_divide_id)
+
+    divider_in.valid := (stat.state === get_mean || stat.state === get_variance) && !lanes.io.busy // held off while the lane pipeline is busy
+    divider_in.bits := stat.count // the denominator is the selected entry's element count
+  }
+
+  {
+    // Divider output
+    val waiting_for_divide_id = MuxCase((num_stats-1).U,
+      stats.zipWithIndex.map { case (s,i) =>
+        (s.state === waiting_for_mean || s.state === waiting_for_variance) -> i.U }
+    )
+    val stat = stats(waiting_for_divide_id)
+
+    divider_out.ready := stat.state === waiting_for_mean || stat.state === waiting_for_variance
+
+    when(stat.state === waiting_for_mean) {
+      stat.mean := divider_out.bits
+    }.elsewhen(stat.state === waiting_for_variance) {
+      stat.inv_stddev := divider_out.bits // the variance is parked in inv_stddev until sqrt and reciprocal replace it
+    }
+  }
+
+  val variance_to_sqrt_id = MuxCase((num_stats-1).U,
+    stats.zipWithIndex.map { case (s,i) =>
+      (s.state === get_stddev) -> i.U }
+  )
+  val variance_to_sqrt = stats(variance_to_sqrt_id).inv_stddev
+  val (sqrt_in, sqrt_out) = variance_to_sqrt.sqrt.get
+
+  {
+    // Sqrt input
+    val stat = stats(variance_to_sqrt_id)
+
+    sqrt_in.valid := stat.state === get_stddev
+  }
+
+  {
+    // Sqrt output
+    val waiting_for_sqrt_id = MuxCase((num_stats-1).U,
+      stats.zipWithIndex.map { case (s,i) =>
+        (s.state === waiting_for_stddev) -> i.U }
+    )
+    val stat = stats(waiting_for_sqrt_id)
+
+    sqrt_out.ready := stat.state === waiting_for_stddev
+
+    // TODO this fallback for stddev === 0 only works if acc_t is an SInt
+    assert(acc_t.isInstanceOf[SInt])
+
+    when (stat.state === waiting_for_stddev) {
+      stat.inv_stddev := Mux(sqrt_out.bits.asUInt() === acc_t.zero.asUInt(), // a zero sqrt result is replaced by 1
+        1.S(acc_t.getWidth.W).asTypeOf(acc_t),
+        sqrt_out.bits
+      )
+    }
+  }
+
+  val stddev_to_inv_id = MuxCase((num_stats-1).U,
+    stats.zipWithIndex.map { case (s,i) =>
+      (s.state === get_inv_stddev) -> i.U }
+  )
+  val stddev_to_inv = stats(stddev_to_inv_id).inv_stddev
+  val (reciprocal_in, reciprocal_out) = stddev_to_inv.reciprocal(scale_t).get
+
+  {
+    // Reciprocal input
+    val stat = stats(stddev_to_inv_id)
+
+    reciprocal_in.valid := stat.state === get_inv_stddev
+    reciprocal_in.bits := DontCare
+  }
+
+  {
+    // Reciprocal output
+    val waiting_for_reciprocal_id = MuxCase((num_stats-1).U,
+      stats.zipWithIndex.map { case (s,i) =>
+        (s.state === waiting_for_inv_stddev) -> i.U }
+    )
+    val stat = stats(waiting_for_reciprocal_id)
+
+    reciprocal_out.ready := stat.state === waiting_for_inv_stddev
+
+    when (stat.state === waiting_for_inv_stddev) {
+      stat.inv_stddev := reciprocal_out.bits.asTypeOf(stat.inv_stddev)
+    }
+  }
+
+  val sum_exp_to_inv_id = MuxCase((num_stats-1).U,
+    stats.zipWithIndex.map { case (s,i) =>
+      (s.state === get_inv_sum_exp) -> i.U }
+  )
+  val sum_exp_to_inv = stats(sum_exp_to_inv_id).sum
+  val exp_divider_in = Wire(Decoupled(UInt(0.W))) // zero-width payload: only valid/ready are meaningful
+  val exp_divider_out = Wire(Decoupled(scale_t.cloneType))
+
+  scale_t match { // NOTE(review): only Float is matched; any other scale_t would raise a MatchError during elaboration
+    case Float(expWidth, sigWidth) =>
+
+      exp_divider_in.bits := DontCare
+
+      // We translate our integer to floating-point form so that we can use the hardfloat divider
+      def in_to_float(x: SInt) = {
+        val in_to_rec_fn = Module(new INToRecFN(intWidth = sum_exp_to_inv.getWidth, expWidth, sigWidth))
+        in_to_rec_fn.io.signedIn := true.B
+        in_to_rec_fn.io.in := x.asUInt()
+        in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag
+        in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
+
+        in_to_rec_fn.io.out
+      }
+
+      val self_rec = in_to_float(sum_exp_to_inv.asUInt().asSInt())
+      val one_rec = in_to_float(127.S) // softmax maximum is 127 for signed int8
+
+      // Instantiate the hardfloat divider
+      val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 0))
+
+      exp_divider_in.ready := divider.io.inReady
+      divider.io.inValid := exp_divider_in.valid
+      divider.io.sqrtOp := false.B
+      divider.io.a := one_rec // numerator is the constant 127, so the quotient is 127 / sum
+      divider.io.b := self_rec
+      divider.io.roundingMode := consts.round_near_even
+      divider.io.detectTininess := consts.tininess_afterRounding
+
+      exp_divider_out.valid := divider.io.outValid_div
+      exp_divider_out.bits := fNFromRecFN(expWidth, sigWidth, divider.io.out).asTypeOf(scale_t)
+  }
+
+
+  {
+    // Divider input
+    val stat = stats(sum_exp_to_inv_id)
+
+    exp_divider_in.valid := (stat.state === get_inv_sum_exp) && !lanes.io.busy // held off while the lane pipeline is busy
+    exp_divider_in.bits := sum_exp_to_inv.asUInt() // later connection than the DontCare above; the target is 0 bits wide
+  }
+
+  {
+    // Divider output
+    val waiting_for_divide_id = MuxCase((num_stats-1).U,
+      stats.zipWithIndex.map { case (s,i) =>
+        (s.state === waiting_for_inv_sum_exp) -> i.U }
+    )
+    val stat = stats(waiting_for_divide_id)
+
+    exp_divider_out.ready := stat.state === waiting_for_inv_sum_exp
+
+    when (stat.state === waiting_for_inv_sum_exp) {
+      stat.inv_sum_exp := exp_divider_out.bits.asTypeOf(stat.inv_sum_exp)
+    }
+  }
+
+  // State transitions
+  for (((stat, next_state), id) <- (stats zip next_states).zipWithIndex) {
+    val state = stat.state
+    val cmd = stat.cmd
+
+    val done = done_with_functional_units(id) // feeds io.in.ready: lets a new request in on the cycle this entry finishes
+
+    when (state === idle) {
+      // We have a different "when" statement below to support the case where a new row is input into the normalizer
+      next_state := idle
+      done := DontCare
+    }.elsewhen(state === output) {
+      next_state := Mux(io.out.fire() && out_stats_id === id.U, idle, state)
+      done := io.out.fire() && out_stats_id === id.U
+    }.elsewhen(state === get_max) {
+      val is_last_lane_input = stat.vec_groups_left === 0.U || // nothing left to issue, or the final group issues this cycle
+        (stat.vec_groups_left === 1.U &&
+          max_lanes.io.ins.bits.stats_id === id.U &&
+          max_lanes.io.ins.fire())
+
+      next_state := Mux(
+        is_last_lane_input,
+        MuxCase(state, Seq(
+          (cmd === NormCmd.MAX) -> idle,
+          (cmd === NormCmd.SUM_EXP || cmd === NormCmd.INV_SUM_EXP) -> get_sum
+        )),
+        state
+      )
+
+      done := is_last_lane_input && cmd === NormCmd.MAX
+    }.elsewhen(state === get_sum) {
+      val is_last_lane_input = stat.vec_groups_left === 0.U || // nothing left to issue, or the final group issues this cycle
+        (stat.vec_groups_left === 1.U &&
+          lanes.io.ins.bits.stats_id === id.U &&
+          lanes.io.ins.fire())
+
+      next_state := Mux(
+        is_last_lane_input,
+        MuxCase(state, Seq(
+          (cmd === NormCmd.SUM || cmd === NormCmd.VARIANCE || cmd === NormCmd.SUM_EXP) -> idle,
+          (cmd === NormCmd.MEAN) -> get_mean,
+          (cmd === NormCmd.INV_STDDEV) -> get_variance,
+          (cmd === NormCmd.INV_SUM_EXP) -> get_inv_sum_exp,
+        )),
+        state
+      )
+//      next_state := Mux(cmd === NormCmd.SUM || cmd === NormCmd.VARIANCE,
+//        Mux(is_last_lane_input, idle, state),
+//        Mux(is_last_lane_input,
+//          Mux(cmd === NormCmd.MEAN, get_mean, get_variance),
+//          state)
+//      )
+
+      done := is_last_lane_input && cmd =/= NormCmd.MEAN && cmd =/= NormCmd.INV_STDDEV && cmd =/= NormCmd.INV_SUM_EXP
+    }.elsewhen(state === get_mean || state === get_variance) {
+      next_state := Mux(divider_in.fire() && sum_to_divide_id === id.U, state.next, state) // state.next relies on the State declaration order
+      done := false.B
+    }.elsewhen(state === waiting_for_mean) {
+      next_state := Mux(divider_out.fire(), idle, state)
+      done := divider_out.fire()
+    }.elsewhen(state === waiting_for_variance) {
+      next_state := Mux(divider_out.fire(), get_stddev, state)
+      done := false.B
+    }.elsewhen(state === get_stddev) {
+      next_state := Mux(sqrt_in.fire() && variance_to_sqrt_id === id.U, state.next, state)
+      done := false.B
+    }.elsewhen(state === waiting_for_stddev) {
+      next_state := Mux(sqrt_out.fire(), state.next, state)
+      done := false.B
+    }.elsewhen(state === get_inv_stddev) {
+      next_state := Mux(reciprocal_in.fire() && stddev_to_inv_id === id.U, state.next, state)
+      done := false.B
+    }.elsewhen(state === waiting_for_inv_stddev) {
+      next_state := Mux(reciprocal_out.fire(), idle, state)
+      done := reciprocal_out.fire()
+    }.elsewhen(state === get_inv_sum_exp) {
+      next_state := Mux(exp_divider_in.fire() && sum_exp_to_inv_id === id.U, state.next, state)
+      done := false.B
+    }.elsewhen(state === waiting_for_inv_sum_exp) {
+      next_state := Mux(exp_divider_out.fire(), idle, state)
+      done := exp_divider_out.fire()
+    }.otherwise {
+      assert(false.B, "invalid state in Normalizer")
+      next_state := DontCare
+      done := DontCare
+    }
+
+    when (io.in.fire() && in_stats_id === id.U) { // placed after the chain above, so a newly accepted request overrides next_state
+      next_state := Mux(io.in.bits.cmd === NormCmd.RESET, output,
+        Mux(io.in.bits.cmd === NormCmd.MAX, get_max, get_sum))
+      when (io.in.bits.cmd === NormCmd.SUM_EXP) {
+        stat.max := stat.running_max // latch the maximum gathered so far for this request
+      }
+    }
+  }
+
+  // Update stats variables
+  for (((stat, next_state), id) <- (stats zip next_states).zipWithIndex) {
+    val state = stat.state
+
+    val reset_running_state = // true in output, and on the cycle an entry leaves get_mean or get_variance
+      state === output ||
+        (state === get_mean && next_state =/= get_mean) ||
+        (state === get_variance && next_state =/= get_variance)
+
+    val is_input = io.in.fire() && in_stats_id === id.U
+
+    when (is_input) {
+      stat.req := io.in.bits
+      stat.count := stat.count + io.in.bits.len
+      stat.elems_left := io.in.bits.len
+    }
+
+    when(reset_running_state) { // later than the is_input block, so these connections win when both fire
+      stat.sum := acc_t.zero
+      stat.count := Mux(is_input, io.in.bits.len, 0.U) // a request accepted in the same cycle restarts the count at its len
+    }
+
+    when (state =/= get_inv_sum_exp && next_state === get_inv_sum_exp) {
+      stat.running_max := acc_t.minimum
+    }
+  }
+
+  dontTouch(stats)
+
+  // Assertions
+  assert(PopCount(stats.map(s => s.state === waiting_for_mean || s.state === waiting_for_variance)) <= 1.U, "we don't support pipelining the divider/sqrt-unit/inv-unit right now")
+  assert(PopCount(stats.map(_.state === waiting_for_stddev)) <= 1.U, "we don't support pipelining the divider/sqrt-unit/inv-unit right now")
+  assert(PopCount(stats.map(_.state === waiting_for_inv_stddev)) <= 1.U, "we don't support pipelining the divider/sqrt-unit/inv-unit right now")
+  assert(PopCount(stats.map(_.state === output)) <= 1.U, "multiple outputs at same time")
+  assert(acc_t.getWidth == scale_t.getWidth, "we use the same variable to hold both the variance and the inv-stddev, so we need them to see the width")
+
+  // Resets
+  when (reset.asBool()) { // stats is a plain Reg, so reset values are applied here; mean, inv_stddev, req and elems_left are not reset
+    stats.foreach(_.state := idle)
+    stats.foreach(_.sum := acc_t.zero)
+    stats.foreach(_.max := acc_t.minimum)
+    stats.foreach(_.running_max := acc_t.minimum)
+    stats.foreach(_.count := 0.U)
+    stats.foreach(_.inv_sum_exp := acc_t.zero)
+  }
+}
+
+object Normalizer { // Factory returning the (input, output) Decoupled ports of either a real Normalizer or a pass-through queue
+  def apply[T <: Data, U <: Data](is_passthru: Boolean, max_len: Int, num_reduce_lanes: Int, num_stats: Int,
+                                  latency: Int, fullDataType: Vec[Vec[T]], scale_t: U)(implicit ev: Arithmetic[T]):
+  (DecoupledIO[NormalizedInput[T,U]], DecoupledIO[NormalizedOutput[T,U]]) = {
+    if (is_passthru) { // num_reduce_lanes and latency are not used on this path
+      passthru(max_len = max_len, num_stats = num_stats, fullDataType = fullDataType, scale_t = scale_t)
+    } else {
+      gen(max_len = max_len, num_reduce_lanes = num_reduce_lanes, num_stats = num_stats, latency = latency,
+        fullDataType = fullDataType, scale_t = scale_t)
+    }
+  }
+
+  def gen[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_stats: Int, latency: Int,
+                                  fullDataType: Vec[Vec[T]], scale_t: U)(implicit ev: Arithmetic[T]): (DecoupledIO[NormalizedInput[T,U]], DecoupledIO[NormalizedOutput[T,U]]) = {
+    val norm_unit_module = Module(new Normalizer(max_len, num_reduce_lanes, num_stats, latency, fullDataType, scale_t)) // instantiates the full normalizer
+    (norm_unit_module.io.in, norm_unit_module.io.out)
+  }
+
+  def passthru[T <: Data, U <: Data](max_len: Int, num_stats: Int, fullDataType: Vec[Vec[T]], scale_t: U)
+                                    (implicit ev: Arithmetic[T]): (DecoupledIO[NormalizedInput[T,U]], DecoupledIO[NormalizedOutput[T,U]]) = {
+
+    val norm_unit_passthru_q = Module(new Queue(new NormalizedInput[T,U](max_len, num_stats, fullDataType, scale_t), 2)) // 2-entry queue in place of a normalizer
+    val norm_unit_passthru_out = Wire(Decoupled(new NormalizedOutput(fullDataType, scale_t)))
+
+    norm_unit_passthru_out.valid := norm_unit_passthru_q.io.deq.valid
+    norm_unit_passthru_out.bits.acc_read_resp := norm_unit_passthru_q.io.deq.bits.acc_read_resp // only the read response is forwarded
+    norm_unit_passthru_out.bits.mean := DontCare // no statistics are computed in pass-through mode
+    norm_unit_passthru_out.bits.max := DontCare
+    norm_unit_passthru_out.bits.inv_stddev := DontCare
+    norm_unit_passthru_out.bits.inv_sum_exp := DontCare
+
+    norm_unit_passthru_q.io.deq.ready := norm_unit_passthru_out.ready
+
+    (norm_unit_passthru_q.io.enq, norm_unit_passthru_out)
+  }
+}
diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala
index 8bb03415..72386aad 100644
--- a/src/main/scala/gemmini/ReservationStation.scala
+++ b/src/main/scala/gemmini/ReservationStation.scala
@@ -293,9 +293,9 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
     }
 
     val is_load = funct === LOAD_CMD || funct === LOAD2_CMD || funct === LOAD3_CMD || (funct === CONFIG_CMD && config_cmd_type === CONFIG_LOAD)
-    val is_store = funct === STORE_CMD || (funct === CONFIG_CMD && config_cmd_type === CONFIG_STORE)
-    val is_ex = funct === PRELOAD_CMD || funct_is_compute || (funct === CONFIG_CMD && (config_cmd_type === CONFIG_EX || config_cmd_type === CONFIG_IM2COL))
-    val is_im2col = funct === CONFIG_CMD && config_cmd_type === CONFIG_IM2COL // im2col commands are a subset of ex commands, so they still go in the ex queue
+    val is_ex = funct === PRELOAD_CMD || funct_is_compute || (funct === CONFIG_CMD && config_cmd_type === CONFIG_EX)
+    val is_store = funct === STORE_CMD || (funct === CONFIG_CMD && (config_cmd_type === CONFIG_STORE || config_cmd_type === CONFIG_NORM))
+    val is_norm = funct === CONFIG_CMD && config_cmd_type === CONFIG_NORM // normalization commands are a subset of store commands, so they still go in the store queue
 
     new_entry.q := Mux1H(Seq(
       is_load -> ldq,
@@ -364,7 +364,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
       }
 
     when (io.alloc.fire) {
-      when (new_entry.is_config && new_entry.q === exq && !is_im2col) {
+      when (new_entry.is_config && new_entry.q === exq) {
         a_stride := new_entry.cmd.cmd.rs1(31, 16) // TODO magic numbers // TODO this needs to be kept in sync with ExecuteController.scala
         c_stride := new_entry.cmd.cmd.rs2(63, 48) // TODO magic numbers // TODO this needs to be kept in sync with ExecuteController.scala
         val set_only_strides = new_entry.cmd.cmd.rs1(7) // TODO magic numbers
@@ -377,7 +377,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
         val repeat_pixels = maxOf(new_entry.cmd.cmd.rs1(8 + pixel_repeats_bits - 1, 8), 1.U) // TODO we use a default value of pixel repeats here, for backwards compatibility. However, we should deprecate and remove this default value eventually
         ld_block_strides(id) := block_stride
         ld_pixel_repeats(id) := repeat_pixels - 1.U
-      }.elsewhen(new_entry.is_config && new_entry.q === stq) {
+      }.elsewhen(new_entry.is_config && new_entry.q === stq && !is_norm) {
         val pool_stride = new_entry.cmd.cmd.rs1(5, 4) // TODO magic numbers
         pooling_is_enabled := pool_stride =/= 0.U
       }.elsewhen(funct === PRELOAD_CMD) {
diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala
index 008dc990..64b66bde 100644
--- a/src/main/scala/gemmini/Scratchpad.scala
+++ b/src/main/scala/gemmini/Scratchpad.scala
@@ -1,3 +1,4 @@
+
 package gemmini
 
 import chisel3._
@@ -26,13 +27,18 @@ class ScratchpadMemReadRequest[U <: Data](local_addr_t: LocalAddr, scale_t_bits:
 
 }
 
-class ScratchpadMemWriteRequest(local_addr_t: LocalAddr, scale_t_bits: Int)
+class ScratchpadMemWriteRequest(local_addr_t: LocalAddr, acc_t_bits: Int, scale_t_bits: Int)
                               (implicit p: Parameters) extends CoreBundle {
   val vaddr = UInt(coreMaxAddrBits.W)
   val laddr = local_addr_t.cloneType
 
-  val acc_act = UInt(2.W) // TODO don't use a magic number for the width here
+  val acc_act = UInt(Activation.bitwidth.W) // TODO don't use a magic number for the width here
   val acc_scale = UInt(scale_t_bits.W)
+  val acc_igelu_qb = UInt(acc_t_bits.W)
+  val acc_igelu_qc = UInt(acc_t_bits.W)
+  val acc_iexp_qln2 = UInt(acc_t_bits.W)
+  val acc_iexp_qln2_inv = UInt(acc_t_bits.W)
+  val acc_norm_stats_id = UInt(8.W) // TODO magic number
 
   val len = UInt(16.W) // TODO don't use a magic number for the width here
   val block = UInt(8.W) // TODO don't use a magic number for the width here
@@ -58,14 +64,12 @@ class ScratchpadMemReadResponse extends Bundle {
 class ScratchpadReadMemIO[U <: Data](local_addr_t: LocalAddr, scale_t_bits: Int)(implicit p: Parameters) extends CoreBundle {
   val req = Decoupled(new ScratchpadMemReadRequest(local_addr_t, scale_t_bits))
   val resp = Flipped(Valid(new ScratchpadMemReadResponse))
-
 }
 
-class ScratchpadWriteMemIO(local_addr_t: LocalAddr, scale_t_bits: Int)
+class ScratchpadWriteMemIO(local_addr_t: LocalAddr, acc_t_bits: Int, scale_t_bits: Int)
                          (implicit p: Parameters) extends CoreBundle {
-  val req = Decoupled(new ScratchpadMemWriteRequest(local_addr_t, scale_t_bits))
+  val req = Decoupled(new ScratchpadMemWriteRequest(local_addr_t, acc_t_bits, scale_t_bits))
   val resp = Flipped(Valid(new ScratchpadMemWriteResponse))
-
 }
 
 class ScratchpadReadReq(val n: Int) extends Bundle {
@@ -203,7 +207,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
       // DMA ports
       val dma = new Bundle {
         val read = Flipped(new ScratchpadReadMemIO(local_addr_t, mvin_scale_t_bits))
-        val write = Flipped(new ScratchpadWriteMemIO(local_addr_t, acc_scale_t_bits))
+        val write = Flipped(new ScratchpadWriteMemIO(local_addr_t, accType.getWidth, acc_scale_t_bits))
       }
 
       // SRAM ports
@@ -215,7 +219,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
       // Accumulator ports
       val acc = new Bundle {
         val read_req = Flipped(Vec(acc_banks, Decoupled(new AccumulatorReadReq(
-          acc_bank_entries, log2Up(accType.getWidth), acc_scale_t.asInstanceOf[V]
+          acc_bank_entries, accType, acc_scale_t.asInstanceOf[V]
         ))))
         val read_resp = Vec(acc_banks, Decoupled(new AccumulatorScaleResp(
           Vec(meshColumns, Vec(tileColumns, inputType)),
@@ -242,25 +246,37 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
     })
 
     val write_dispatch_q = Queue(io.dma.write.req)
-    write_dispatch_q.ready := false.B
-    // Write scale queue is necessary to maintain in-order requests to accumulator scale unit
+    // Write norm/scale queues are necessary to maintain in-order requests to accumulator norm/scale units
     // Writes from main SPAD just flow directly between scale_q and issue_q, while writes
     // From acc are ordered
-    val write_scale_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), spad_read_delay))
-    val write_issue_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, acc_scale_t_bits), spad_read_delay+1, pipe=true))
+    val write_norm_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, accType.getWidth, acc_scale_t_bits), spad_read_delay+2))
+    val write_scale_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, accType.getWidth, acc_scale_t_bits), spad_read_delay+2))
+    val write_issue_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, accType.getWidth, acc_scale_t_bits), spad_read_delay+1, pipe=true))
     val read_issue_q = Module(new Queue(new ScratchpadMemReadRequest(local_addr_t, mvin_scale_t_bits), spad_read_delay+1, pipe=true)) // TODO can't this just be a normal queue?
 
+    write_dispatch_q.ready := false.B
+
+    write_norm_q.io.enq.valid := false.B
+    write_norm_q.io.enq.bits := write_dispatch_q.bits
+    write_norm_q.io.deq.ready := false.B
+
     write_scale_q.io.enq.valid := false.B
-    write_scale_q.io.enq.bits  := write_dispatch_q.bits
+    write_scale_q.io.enq.bits  := write_norm_q.io.deq.bits
     write_scale_q.io.deq.ready := false.B
 
     write_issue_q.io.enq.valid := false.B
     write_issue_q.io.enq.bits := write_scale_q.io.deq.bits
 
-    // Garbage can immediately fire between dispatch_q and scale_q
+    // Garbage can immediately fire from dispatch_q -> norm_q
     when (write_dispatch_q.bits.laddr.is_garbage()) {
-      write_scale_q.io.enq <> write_dispatch_q
+      write_norm_q.io.enq <> write_dispatch_q
     }
+
+    // Non-acc or garbage can immediately fire between norm_q and scale_q
+    when (write_norm_q.io.deq.bits.laddr.is_garbage() || !write_norm_q.io.deq.bits.laddr.is_acc_addr) {
+      write_scale_q.io.enq <> write_norm_q.io.deq
+    }
+
     // Non-acc or garbage can immediately fire between scale_q and issue_q
     when (write_scale_q.io.deq.bits.laddr.is_garbage() || !write_scale_q.io.deq.bits.laddr.is_acc_addr) {
       write_issue_q.io.enq <> write_scale_q.io.deq
@@ -425,7 +441,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
     writer.module.io.flush := io.flush
     reader.module.io.flush := io.flush
 
-    io.busy := writer.module.io.busy || reader.module.io.busy || write_issue_q.io.deq.valid || write_scale_q.io.deq.valid || write_dispatch_q.valid
+    io.busy := writer.module.io.busy || reader.module.io.busy || write_issue_q.io.deq.valid || write_norm_q.io.deq.valid || write_scale_q.io.deq.valid || write_dispatch_q.valid
 
     val spad_mems = {
       val banks = Seq.fill(sp_banks) { Module(new ScratchpadBank(
@@ -444,7 +460,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
         val exread = ex_read_req.valid
 
         // TODO we tie the write dispatch queue's, and write issue queue's, ready and valid signals together here
-        val dmawrite = write_dispatch_q.valid && write_scale_q.io.enq.ready &&
+        val dmawrite = write_dispatch_q.valid && write_norm_q.io.enq.ready &&
           !write_dispatch_q.bits.laddr.is_garbage() &&
           !(bio.write.en && config.sp_singleported.B) &&
           !write_dispatch_q.bits.laddr.is_acc_addr && write_dispatch_q.bits.laddr.sp_bank() === i.U
@@ -462,7 +478,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
 
           when (bio.read.req.fire) {
             write_dispatch_q.ready := true.B
-            write_scale_q.io.enq.valid := true.B
+            write_norm_q.io.enq.valid := true.B
 
             io.dma.write.resp.valid := true.B
           }
@@ -543,34 +559,73 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
     val acc_row_t = Vec(meshColumns, Vec(tileColumns, accType))
     val spad_row_t = Vec(meshColumns, Vec(tileColumns, inputType))
 
+//    val acc_norm_unit = Module(new Normalizer(
+//      max_len = block_cols,
+//      num_reduce_lanes = -1,
+//      num_stats = 4,
+//      latency = 4,
+//      fullDataType = acc_row_t,
+//      scale_t = acc_scale_t,
+//    ))
+
+    val (acc_norm_unit_in, acc_norm_unit_out) = Normalizer(
+      is_passthru = !config.has_normalizations,
+      max_len = block_cols,
+      num_reduce_lanes = -1,
+      num_stats = 4,
+      latency = 4,
+      fullDataType = acc_row_t,
+      scale_t = acc_scale_t,
+    )
+
+    acc_norm_unit_in.valid := false.B
+    acc_norm_unit_in.bits.len := write_norm_q.io.deq.bits.len
+    acc_norm_unit_in.bits.stats_id := write_norm_q.io.deq.bits.acc_norm_stats_id
+    acc_norm_unit_in.bits.cmd := write_norm_q.io.deq.bits.laddr.norm_cmd
+    acc_norm_unit_in.bits.acc_read_resp := DontCare
+
     val acc_scale_unit = Module(new AccumulatorScale(
       acc_row_t,
       spad_row_t,
       acc_scale_t.asInstanceOf[V],
-      log2Up(accType.getWidth),
       acc_read_small_width,
       acc_read_full_width,
       acc_scale_func,
       acc_scale_num_units,
       acc_scale_latency,
       has_nonlinear_activations,
+      has_normalizations,
     ))
 
-    acc_scale_unit.io.in.valid := false.B
-    acc_scale_unit.io.in.bits  := DontCare
-    val dma_resp_ready = (
-      writer.module.io.req.ready &&
-      write_issue_q.io.deq.bits.laddr.is_acc_addr &&
-      !write_issue_q.io.deq.bits.laddr.is_garbage()
-    )
+    val acc_waiting_to_be_scaled = write_scale_q.io.deq.valid &&
+      !write_scale_q.io.deq.bits.laddr.is_garbage() &&
+      write_scale_q.io.deq.bits.laddr.is_acc_addr &&
+      write_issue_q.io.enq.ready
+
+    acc_norm_unit_out.ready := acc_scale_unit.io.in.ready && acc_waiting_to_be_scaled
+    acc_scale_unit.io.in.valid := acc_norm_unit_out.valid && acc_waiting_to_be_scaled
+    acc_scale_unit.io.in.bits  := acc_norm_unit_out.bits
+
+    when (acc_scale_unit.io.in.fire()) {
+      write_issue_q.io.enq <> write_scale_q.io.deq
+    }
+
     acc_scale_unit.io.out.ready := false.B
+
+    val dma_resp_ready =
+      writer.module.io.req.ready &&
+        write_issue_q.io.deq.bits.laddr.is_acc_addr &&
+        !write_issue_q.io.deq.bits.laddr.is_garbage()
+
     when (acc_scale_unit.io.out.bits.fromDMA && dma_resp_ready) {
+      // Send the acc-scale result into the DMA
       acc_scale_unit.io.out.ready := true.B
       writeData.valid := acc_scale_unit.io.out.valid
       writeData.bits  := acc_scale_unit.io.out.bits.data.asUInt
       fullAccWriteData := acc_scale_unit.io.out.bits.full_data.asUInt
     }
     for (i <- 0 until acc_banks) {
+      // Send the acc-scale result to the ExController
       io.acc.read_resp(i).valid := false.B
       io.acc.read_resp(i).bits  := acc_scale_unit.io.out.bits
       when (!acc_scale_unit.io.out.bits.fromDMA && acc_scale_unit.io.out.bits.acc_bank_id === i.U) {
@@ -608,18 +663,21 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
         val exread = ex_read_req.valid
 
         // TODO we tie the write dispatch queue's, and write issue queue's, ready and valid signals together here
-        val dmawrite = write_dispatch_q.valid && write_scale_q.io.enq.ready &&
+        val dmawrite = write_dispatch_q.valid && write_norm_q.io.enq.ready &&
           !write_dispatch_q.bits.laddr.is_garbage() &&
           write_dispatch_q.bits.laddr.is_acc_addr && write_dispatch_q.bits.laddr.acc_bank() === i.U
 
         bio.read.req.valid := exread || dmawrite
-        bio.read.req.bits.relu6_shift := ex_read_req.bits.relu6_shift
         ex_read_req.ready := bio.read.req.ready
 
         // The ExecuteController gets priority when reading from accumulator banks
         when (exread) {
           bio.read.req.bits.addr := ex_read_req.bits.addr
           bio.read.req.bits.act := ex_read_req.bits.act
+          bio.read.req.bits.igelu_qb := ex_read_req.bits.igelu_qb
+          bio.read.req.bits.igelu_qc := ex_read_req.bits.igelu_qc
+          bio.read.req.bits.iexp_qln2 := ex_read_req.bits.iexp_qln2
+          bio.read.req.bits.iexp_qln2_inv := ex_read_req.bits.iexp_qln2_inv
           bio.read.req.bits.scale := ex_read_req.bits.scale
           bio.read.req.bits.full := false.B
           bio.read.req.bits.fromDMA := false.B
@@ -627,12 +685,16 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
           bio.read.req.bits.addr := write_dispatch_q.bits.laddr.acc_row()
           bio.read.req.bits.full := write_dispatch_q.bits.laddr.read_full_acc_row
           bio.read.req.bits.act := write_dispatch_q.bits.acc_act
+          bio.read.req.bits.igelu_qb := write_dispatch_q.bits.acc_igelu_qb.asTypeOf(bio.read.req.bits.igelu_qb)
+          bio.read.req.bits.igelu_qc := write_dispatch_q.bits.acc_igelu_qc.asTypeOf(bio.read.req.bits.igelu_qc)
+          bio.read.req.bits.iexp_qln2 := write_dispatch_q.bits.acc_iexp_qln2.asTypeOf(bio.read.req.bits.iexp_qln2)
+          bio.read.req.bits.iexp_qln2_inv := write_dispatch_q.bits.acc_iexp_qln2_inv.asTypeOf(bio.read.req.bits.iexp_qln2_inv)
           bio.read.req.bits.scale := write_dispatch_q.bits.acc_scale.asTypeOf(bio.read.req.bits.scale)
           bio.read.req.bits.fromDMA := true.B
 
           when (bio.read.req.fire) {
             write_dispatch_q.ready := true.B
-            write_scale_q.io.enq.valid := true.B
+            write_norm_q.io.enq.valid := true.B
 
             io.dma.write.resp.valid := true.B
           }
@@ -641,22 +703,24 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
         }
         bio.read.resp.ready := false.B
 
-        when (write_scale_q.io.deq.valid &&
-              acc_scale_unit.io.in.ready &&
-              bio.read.resp.valid &&
-              write_issue_q.io.enq.ready &&
-              write_scale_q.io.deq.bits.laddr.is_acc_addr &&
-              !write_scale_q.io.deq.bits.laddr.is_garbage() &&
-              write_scale_q.io.deq.bits.laddr.acc_bank() === i.U) {
-          write_scale_q.io.deq.ready   := true.B
-          acc_scale_unit.io.in.valid := true.B
+        when (write_norm_q.io.deq.valid &&
+          acc_norm_unit_in.ready &&
+          bio.read.resp.valid &&
+          write_scale_q.io.enq.ready &&
+          write_norm_q.io.deq.bits.laddr.is_acc_addr &&
+          !write_norm_q.io.deq.bits.laddr.is_garbage() &&
+          write_norm_q.io.deq.bits.laddr.acc_bank() === i.U)
+        {
+          write_norm_q.io.deq.ready := true.B
+          acc_norm_unit_in.valid := true.B
           bio.read.resp.ready := true.B
-          write_issue_q.io.enq.valid := true.B
 
-          acc_scale_unit.io.in.bits := bio.read.resp.bits
-          acc_scale_unit.io.in.bits.acc_bank_id := i.U
-        }
+          // Some normalizer commands don't write to main memory, so they don't need to be passed on to the scaling units
+          write_scale_q.io.enq.valid := NormCmd.writes_to_main_memory(write_norm_q.io.deq.bits.laddr.norm_cmd)
 
+          acc_norm_unit_in.bits.acc_read_resp := bio.read.resp.bits
+          acc_norm_unit_in.bits.acc_read_resp.acc_bank_id := i.U
+        }
       }
 
       // Writing to the accumulator banks
@@ -682,7 +746,6 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
 
         // We need to make sure that we don't try to return a dma read resp from both mvin_scale and mvin_scale_acc
         // at the same time. mvin_scale always gets priority in this cases
-        // val spad_last = mvin_scale_out.valid && mvin_scale_out.bits.last && !mvin_scale_out.bits.tag.is_acc
         val spad_last = mvin_scale_pixel_repeater.io.resp.valid && mvin_scale_pixel_repeater.io.resp.bits.last && !mvin_scale_pixel_repeater.io.resp.bits.tag.is_acc
 
         val dmaread = (from_mvin_scale || from_mvin_scale_acc) &&
diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala
index 692a8e04..45ec459c 100644
--- a/src/main/scala/gemmini/StoreController.scala
+++ b/src/main/scala/gemmini/StoreController.scala
@@ -11,14 +11,14 @@ import midas.targetutils.PerfCounter
 
 // TODO this is almost a complete copy of LoadController. We should combine them into one class
 // TODO deal with errors when reading scratchpad responses
-class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], coreMaxAddrBits: Int, local_addr_t: LocalAddr)
-                     (implicit p: Parameters) extends Module {
+class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V],
+                                                                    coreMaxAddrBits: Int, local_addr_t: LocalAddr)(implicit p: Parameters) extends Module {
   import config._
 
   val io = IO(new Bundle {
     val cmd = Flipped(Decoupled(new GemminiCmd(reservation_station_entries)))
 
-    val dma = new ScratchpadWriteMemIO(local_addr_t, acc_scale_t_bits)
+    val dma = new ScratchpadWriteMemIO(local_addr_t, accType.getWidth, acc_scale_t_bits)
 
     val completed = Decoupled(UInt(log2Up(reservation_station_entries).W))
 
@@ -42,7 +42,12 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   val block_cols = meshColumns * tileColumns
   val max_blocks = (dma_maxbytes / (block_cols * inputType.getWidth / 8)) max 1
 
-  val activation = Reg(UInt(GemminiISA.CONFIG_MVOUT_RS1_ACTIVATION_WIDTH.W))
+  val activation = Reg(UInt(Activation.bitwidth.W)) // TODO magic number
+  val igelu_qb = Reg(accType)
+  val igelu_qc = Reg(accType)
+  val iexp_qln2 = Reg(accType)
+  val iexp_qln2_inv = Reg(accType)
+  val norm_stats_id = Reg(UInt(8.W)) // TODO magic number
   val acc_scale = Reg(acc_scale_t)
 
   //val row_counter = RegInit(0.U(log2Ceil(block_rows).W))
@@ -87,6 +92,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
 
   val config_mvout_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvoutRs1)
   val config_mvout_rs2 = cmd.bits.cmd.rs2.asTypeOf(new ConfigMvoutRs2(acc_scale_t_bits, 32))
+  val config_cmd_type = config_mvout_rs1.cmd_type
   val config_stride = config_mvout_rs2.stride
   val config_activation = config_mvout_rs1.activation
   val config_acc_scale = config_mvout_rs2.acc_scale
@@ -100,10 +106,22 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   val config_upad = config_mvout_rs1.upad
   val config_lpad = config_mvout_rs1.lpad
 
+  val config_norm_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigNormRs1(accType.getWidth))
+  val config_norm_rs2 = cmd.bits.cmd.rs2.asTypeOf(new ConfigNormRs2(accType.getWidth))
+  val config_stats_id = config_norm_rs1.norm_stats_id
+  val config_activation_msb = config_norm_rs1.act_msb
+  val config_set_stats_id_only = config_norm_rs1.set_stats_id_only
+  val config_iexp_q_const_type = config_norm_rs1.q_const_type
+  val config_iexp_q_const = config_norm_rs1.q_const
+  val config_igelu_qb = config_norm_rs2.qb
+  val config_igelu_qc = config_norm_rs2.qc
+
+  assert(config_norm_rs1.cmd_type === config_mvout_rs1.cmd_type)
+
   val mstatus = cmd.bits.cmd.status
 
   val current_vaddr = vaddr + row_counter * stride
-  val current_localaddr = localaddr + (block_counter * block_stride + row_counter)
+  val current_localaddr = WireInit(localaddr + (block_counter * block_stride + row_counter))
 
   val pool_row_addr = localaddr + (orow * pool_ocols +& ocol)
   when (orow_is_negative || ocol_is_negative || orow >= pool_orows || ocol >= pool_ocols) {
@@ -112,8 +130,9 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
 
   val pool_vaddr = vaddr + (porow_counter * pool_out_dim + pocol_counter) * stride // TODO get rid of these multiplications
 
-  val DoConfig = cmd.bits.cmd.inst.funct === CONFIG_CMD
-  val DoStore = !DoConfig // TODO change this if more commands are added
+  val DoConfig = cmd.bits.cmd.inst.funct === CONFIG_CMD && config_cmd_type === CONFIG_STORE
+  val DoConfigNorm = config.has_normalizations.B && cmd.bits.cmd.inst.funct === CONFIG_CMD && config_cmd_type === CONFIG_NORM
+  val DoStore = !DoConfig && !DoConfigNorm
 
   cmd.ready := false.B
 
@@ -140,8 +159,15 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
 
   io.dma.req.bits.vaddr := Mux(pooling_is_enabled || mvout_1d_enabled, pool_vaddr, current_vaddr)
   io.dma.req.bits.laddr := Mux(pooling_is_enabled, pool_row_addr, current_localaddr) //Todo: laddr for 1D?
+  io.dma.req.bits.laddr.norm_cmd := Mux(block_counter === blocks - 1.U, current_localaddr.norm_cmd,
+        NormCmd.non_reset_version(current_localaddr.norm_cmd))
 
   io.dma.req.bits.acc_act := activation
+  io.dma.req.bits.acc_igelu_qb := igelu_qb.asTypeOf(io.dma.req.bits.acc_igelu_qb)
+  io.dma.req.bits.acc_igelu_qc := igelu_qc.asTypeOf(io.dma.req.bits.acc_igelu_qc)
+  io.dma.req.bits.acc_iexp_qln2 := iexp_qln2.asTypeOf(io.dma.req.bits.acc_iexp_qln2)
+  io.dma.req.bits.acc_iexp_qln2_inv := iexp_qln2_inv.asTypeOf(io.dma.req.bits.acc_iexp_qln2_inv)
+  io.dma.req.bits.acc_norm_stats_id := norm_stats_id
   io.dma.req.bits.acc_scale := acc_scale.asTypeOf(io.dma.req.bits.acc_scale)
 
   io.dma.req.bits.len := Mux(block_counter === blocks - 1.U, ((cols - 1.U) % block_cols.U) + 1.U, block_cols.U)
@@ -221,10 +247,24 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
           }
           cmd.ready := true.B
         }
-          .elsewhen(DoStore && cmd_tracker.io.alloc.fire()) {
-            val next_state = Mux(pooling_is_enabled, pooling, sending_rows)
-            control_state := Mux(io.dma.req.fire, next_state, waiting_for_dma_req_ready)
+        .elsewhen(config.has_normalizations.B && DoConfigNorm) {
+          when (!config_set_stats_id_only.asBool()) {
+            igelu_qb := config_igelu_qb.asTypeOf(igelu_qb)
+            igelu_qc := config_igelu_qc.asTypeOf(igelu_qc)
+            when(config_iexp_q_const_type === 0.U) {
+              iexp_qln2 := config_iexp_q_const.asTypeOf(iexp_qln2)
+            }.elsewhen(config_iexp_q_const_type === 1.U) {
+              iexp_qln2_inv := config_iexp_q_const.asTypeOf(iexp_qln2_inv)
+            }
+            activation := Cat(config_activation_msb, activation(1, 0)) // TODO: magic number
           }
+          norm_stats_id := config_stats_id
+          cmd.ready := true.B
+        }
+        .elsewhen(DoStore && cmd_tracker.io.alloc.fire()) {
+          val next_state = Mux(pooling_is_enabled, pooling, sending_rows)
+          control_state := Mux(io.dma.req.fire, next_state, waiting_for_dma_req_ready)
+        }
       }
     }
 
@@ -260,6 +300,17 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
     }
   }
 
+  // Optimizations when features are disabled
+  if (!config.has_normalizations) {
+    current_localaddr.norm_cmd := NormCmd.RESET
+
+    igelu_qb := DontCare
+    igelu_qc := DontCare
+    iexp_qln2 := DontCare
+    iexp_qln2_inv := DontCare
+    norm_stats_id := 0.U
+  }
+
   // Performance counter
   CounterEventIO.init(io.counter)
   io.counter.connectEventSignal(CounterEvent.STORE_ACTIVE_CYCLE, control_state === sending_rows || control_state === pooling)

From ccd18b0396ad155c463248a70b58a7ff0eda9076 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Thu, 25 Aug 2022 15:21:36 -0700
Subject: [PATCH 04/64] Add parallelized testing infrastructure (#238)

Also, fix MIDAS builds
---
 CHIPYARD.hash                                 |  2 +-
 README.md                                     | 14 +++++--
 scripts/build-midas.sh                        |  2 +
 scripts/run-midas.sh                          |  3 +-
 software/gemmini-rocc-tests                   |  2 +-
 src/main/scala/gemmini/BeatMerger.scala       | 11 ++++--
 src/main/scala/gemmini/DMA.scala              | 19 +++++----
 src/main/scala/gemmini/LoadController.scala   |  2 +-
 src/main/scala/gemmini/LoopConv.scala         | 39 +++++++++++--------
 src/main/scala/gemmini/LoopMatmul.scala       |  1 +
 .../scala/gemmini/ReservationStation.scala    |  7 ++--
 src/main/scala/gemmini/StoreController.scala  |  2 +-
 src/main/scala/gemmini/ZeroWriter.scala       |  3 +-
 13 files changed, 66 insertions(+), 41 deletions(-)

diff --git a/CHIPYARD.hash b/CHIPYARD.hash
index f41949c3..b154a058 100644
--- a/CHIPYARD.hash
+++ b/CHIPYARD.hash
@@ -1 +1 @@
-117624d8eea27bafd613eec09e9b9b3e31239e08
+481398b910fa95ec88dd578c67ba358a4d83129d
diff --git a/README.md b/README.md
index b6b73b1c..77d6466d 100644
--- a/README.md
+++ b/README.md
@@ -32,20 +32,28 @@ Run these steps to install Chipyard and Spike (make sure to checkout the correct
 ```shell
 git clone https://github.com/ucb-bar/chipyard.git
 cd chipyard
-git checkout 117624d8eea27bafd613eec09e9b9b3e31239e08
+git checkout 481398b910fa95ec88dd578c67ba358a4d83129d
 ./scripts/init-submodules-no-riscv-tools.sh
 ./scripts/build-toolchains.sh esp-tools
 
 source env.sh
 
 cd generators/gemmini
-git fetch && git checkout dev && git pull origin dev
+git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
+git fetch --unshallow && git checkout dev && git pull origin dev
 git submodule update
 
 cd -
 cd toolchains/esp-tools/riscv-isa-sim/build
-git fetch && git checkout 090e82c473fd28b4eb2011ffcd771ead6076faab
+git fetch --unshallow && git checkout 2ed403a70f65559a3c2a06bf724d4737edc73a23
 make && make install
+
+# The final step is only necessary if you want to run MIDAS simulations with
+# realistic DRAM models
+cd -
+cd sims/firesim
+git fetch --tags && git checkout 1.13.6
+./build-setup.sh --library --skip-validate
 ```
 
 Setting Up Gemmini
diff --git a/scripts/build-midas.sh b/scripts/build-midas.sh
index c966513c..590ced9d 100755
--- a/scripts/build-midas.sh
+++ b/scripts/build-midas.sh
@@ -53,6 +53,8 @@ if [ dram_model == "" ]; then
   echo DRAM model must be provided.
 fi
 
+export SYSLIBS=" $SYSLIBS -l:libdwarf.so -l:libelf.so -lz -lgmp "
+
 cd ../../sims/firesim/
 source sourceme-f1-manager.sh &> build.log
 
diff --git a/scripts/run-midas.sh b/scripts/run-midas.sh
index 9bae1813..806b21b1 100755
--- a/scripts/run-midas.sh
+++ b/scripts/run-midas.sh
@@ -121,5 +121,6 @@ if [ ! -f ./${simulator}${DEBUG} ]; then
 fi
 
 ./${simulator}${DEBUG} ${PK} ${full_binary_path} ${waveform_flag} \
-    +vcs+initreg+0 +vcs+initmem+0 +fesvr-step-size=128 +mm_relaxFunctionalModel_0=0 +mm_openPagePolicy_0=1 +mm_backendLatency_0=2 +mm_schedulerWindowSize_0=8 +mm_transactionQueueDepth_0=8 +mm_dramTimings_tAL_0=0 +mm_dramTimings_tCAS_0=14 +mm_dramTimings_tCMD_0=1 +mm_dramTimings_tCWD_0=10 +mm_dramTimings_tCCD_0=4 +mm_dramTimings_tFAW_0=25 +mm_dramTimings_tRAS_0=33 +mm_dramTimings_tREFI_0=7800 +mm_dramTimings_tRC_0=47 +mm_dramTimings_tRCD_0=14 +mm_dramTimings_tRFC_0=160 +mm_dramTimings_tRRD_0=8 +mm_dramTimings_tRP_0=14 +mm_dramTimings_tRTP_0=8 +mm_dramTimings_tRTRS_0=2 +mm_dramTimings_tWR_0=15 +mm_dramTimings_tWTR_0=8 +mm_rowAddr_offset_0=18 +mm_rowAddr_mask_0=65535 +mm_rankAddr_offset_0=16 +mm_rankAddr_mask_0=3 +mm_bankAddr_offset_0=13 +mm_bankAddr_mask_0=7 +mm_llc_wayBits_0=3 +mm_llc_setBits_0=12 +mm_llc_blockBits_0=7 +mm_llc_activeMSHRs_0=8 +shmemportname0=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +macaddr0=00:00:00:00:00:02 +niclog0=niclog0 +linklatency0=6405 +netbw0=100 +netburst0=8 +nic-loopback0 +tracefile=TRACEFILE +blkdev-in-mem0=128 +blkdev-log0=blkdev-log0 +autocounter-readrate=1000 +autocounter-filename=AUTOCOUNTERFILE +dramsim +max-cycles=100000000 \
+    +vcs+initreg+0 +vcs+initmem+0 +fesvr-step-size=128 +mm_relaxFunctionalModel_0=0 +mm_openPagePolicy_0=1 +mm_backendLatency_0=2 +mm_schedulerWindowSize_0=8 +mm_transactionQueueDepth_0=8 +mm_dramTimings_tAL_0=0 +mm_dramTimings_tCAS_0=14 +mm_dramTimings_tCMD_0=1 +mm_dramTimings_tCWD_0=10 +mm_dramTimings_tCCD_0=4 +mm_dramTimings_tFAW_0=25 +mm_dramTimings_tRAS_0=33 +mm_dramTimings_tREFI_0=7800 +mm_dramTimings_tRC_0=47 +mm_dramTimings_tRCD_0=14 +mm_dramTimings_tRFC_0=160 +mm_dramTimings_tRRD_0=8 +mm_dramTimings_tRP_0=14 +mm_dramTimings_tRTP_0=8 +mm_dramTimings_tRTRS_0=2 +mm_dramTimings_tWR_0=15 +mm_dramTimings_tWTR_0=8 +mm_rowAddr_offset_0=18 +mm_rowAddr_mask_0=65535 +mm_rankAddr_offset_0=16 +mm_rankAddr_mask_0=3 +mm_bankAddr_offset_0=13 +mm_bankAddr_mask_0=7 +mm_llc_wayBits_0=3 +mm_llc_setBits_0=12 +mm_llc_blockBits_0=7 +mm_llc_activeMSHRs_0=8 +shmemportname0=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +macaddr0=00:00:00:00:00:02 +niclog0=niclog0 +linklatency0=6405 +netbw0=100 +netburst0=8 +nic-loopback0 +tracefile=TRACEFILE +blkdev-in-mem0=128 +blkdev-log0=blkdev-log0 +autocounter-readrate=1000 +autocounter-filename=AUTOCOUNTERFILE +max-cycles=100000000 \
+    +dramsim +dramsim_ini_dir=/home/eecs/hngenc/chip/generators/testchipip/src/main/resources/dramsim2_ini \
     2>/dev/null
diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index 37464740..70f8e58b 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit 374647403d5e73543463f9f66f730aa16bc8e362
+Subproject commit 70f8e58b8113b288f0937a4777fe582437ea36ce
diff --git a/src/main/scala/gemmini/BeatMerger.scala b/src/main/scala/gemmini/BeatMerger.scala
index a845327b..a6a67dab 100644
--- a/src/main/scala/gemmini/BeatMerger.scala
+++ b/src/main/scala/gemmini/BeatMerger.scala
@@ -1,3 +1,4 @@
+
 package gemmini
 
 import chisel3._
@@ -26,7 +27,8 @@ class BeatMergerOut(val spadWidth: Int, val accWidth: Int, val spadRows: Int, va
   maxReqBytes: in bytes
   aligned_to: in bytes
  */
-class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWidth: Int, spadRows: Int, accRows: Int, maxReqBytes: Int, alignedTo: Int, meshRows: Int, mvin_scale_t_bits: Int, nCmds: Int)
+class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWidth: Int, spadRows: Int, accRows: Int,
+                            maxReqBytes: Int, alignedTo: Int, meshRows: Int, mvin_scale_t_bits: Int, nCmds: Int)
   extends Module {
   val io = IO(new Bundle {
     val req = Flipped(Decoupled(new XactTrackerEntry(maxShift, spadWidth, accWidth, spadRows, accRows, maxReqBytes, mvin_scale_t_bits, nCmds)))
@@ -75,9 +77,10 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid
     val total_bytes_sent = req.bits.spad_row_offset + bytesSent
     Mux(req.bits.has_acc_bitwidth,
       // We only add "if" statements here to satisfy the Verilator linter. The code would be cleaner without the
-      // "if" condition and the "else" clause
-      if (total_bytes_sent.getWidth >= log2Up(accWidthBytes+1)) total_bytes_sent / accWidthBytes.U else 0.U,
-      if (total_bytes_sent.getWidth >= log2Up(spadWidthBytes+1)) total_bytes_sent / spadWidthBytes.U else 0.U)
+      // "if" condition and the "else" clause. Similarly, the width expansions are also there to satisfy the Verilator
+      // linter, despite making the code uglier.
+      if (total_bytes_sent.getWidth >= log2Up(accWidthBytes + 1)) total_bytes_sent / accWidthBytes.U(total_bytes_sent.getWidth.W) else 0.U,
+      if (total_bytes_sent.getWidth >= log2Up(spadWidthBytes + 1)) total_bytes_sent / spadWidthBytes.U(total_bytes_sent.getWidth.W) else 0.U)
   }
 
   io.out.bits.is_acc := req.bits.is_acc
diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala
index 9761228f..41164f30 100644
--- a/src/main/scala/gemmini/DMA.scala
+++ b/src/main/scala/gemmini/DMA.scala
@@ -1,3 +1,4 @@
+
 package gemmini
 
 import chisel3._
@@ -120,10 +121,11 @@ class StreamReadBeat (val nXacts: Int, val beatBits: Int, val maxReqBytes: Int)
 }
 
 // TODO StreamReaderCore and StreamWriter are actually very alike. Is there some parent class they could both inherit from?
-class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], nXacts: Int, beatBits: Int, maxBytes: Int,
-                                  spadWidth: Int, accWidth: Int, aligned_to: Int,
-                                  spad_rows: Int, acc_rows: Int, meshRows: Int, use_tlb_register_filter: Boolean,
-                                  use_firesim_simulation_counters: Boolean)
+class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], nXacts: Int, beatBits: Int,
+                                                        maxBytes: Int, spadWidth: Int, accWidth: Int, aligned_to: Int,
+                                                        spad_rows: Int, acc_rows: Int, meshRows: Int,
+                                                        use_tlb_register_filter: Boolean,
+                                                        use_firesim_simulation_counters: Boolean)
                                  (implicit p: Parameters) extends LazyModule {
   val node = TLHelper.makeClientNode(
     name = "stream-reader", sourceId = IdRange(0, nXacts))
@@ -263,9 +265,10 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf
     io.reserve.entry.addr := req.spaddr + req.block_stride *
       Mux(req.has_acc_bitwidth,
         // We only add "if" statements here to satisfy the Verilator linter. The code would be cleaner without the
-        // "if" condition and the "else" clause
-        if (bytesRequested.getWidth >= log2Up(accWidthBytes+1)) bytesRequested / accWidthBytes.U else 0.U,
-        if (bytesRequested.getWidth >= log2Up(spadWidthBytes+1)) bytesRequested / spadWidthBytes.U else 0.U)
+        // "if" condition and the "else" clause. Similarly, the width expansions are also there to satisfy the Verilator
+        // linter, despite making the code uglier.
+        if (bytesRequested.getWidth >= log2Up(accWidthBytes+1)) bytesRequested / accWidthBytes.U(bytesRequested.getWidth.W) else 0.U,
+        if (bytesRequested.getWidth >= log2Up(spadWidthBytes+1)) bytesRequested / spadWidthBytes.U(bytesRequested.getWidth.W) else 0.U)
     io.reserve.entry.spad_row_offset := Mux(req.has_acc_bitwidth, bytesRequested % accWidthBytes.U, bytesRequested % spadWidthBytes.U)
 
     when (untranslated_a.fire) {
@@ -408,7 +411,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
       val bytes_written = UInt(log2Up(maxBytes+1).W)
       val bytes_written_per_beat = Vec(maxBeatsPerReq, UInt(log2Up(beatBytes+1).W))
 
-      def total_beats(dummy: Int = 0) = Mux(size < beatBytes.U, 1.U, size / beatBytes.U)
+      def total_beats(dummy: Int = 0) = Mux(size < beatBytes.U, 1.U, size / beatBytes.U(size.getWidth.W)) // The width expansion is added here solely to satisfy Verilator's linter
     }
 
     val smallest_write_size = aligned_to max beatBytes
diff --git a/src/main/scala/gemmini/LoadController.scala b/src/main/scala/gemmini/LoadController.scala
index db69857a..71ecf7c7 100644
--- a/src/main/scala/gemmini/LoadController.scala
+++ b/src/main/scala/gemmini/LoadController.scala
@@ -114,7 +114,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   cmd_tracker.io.alloc.valid := control_state === waiting_for_command && cmd.valid && DoLoad
   cmd_tracker.io.alloc.bits.bytes_to_read :=
     Mux(io.dma.req.bits.has_acc_bitwidth, cols * actual_rows_read * config.accType.getWidth.U,
-      cols * actual_rows_read * config.inputType.getWidth.U) / 8.U
+      cols * actual_rows_read * config.inputType.getWidth.U) >> 3 // We replaced a very clear "/ 8.U" operation here with a ">> 3" operation, solely to satisfy Verilator's linter
   cmd_tracker.io.alloc.bits.tag.rob_id := cmd.bits.rob_id.bits
   cmd_tracker.io.request_returned.valid := io.dma.resp.fire // TODO use a bundle connect
   cmd_tracker.io.request_returned.bits.cmd_id := io.dma.resp.bits.cmd_id // TODO use a bundle connect
diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index 3fe28e98..210bcade 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -1,3 +1,4 @@
+
 package gemmini
 
 import chisel3._
@@ -115,7 +116,7 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi
   // Addresses
   val dram_offset = och * (acc_w/8).U
   val dram_addr = Mux(req.no_bias, 0.U, req.dram_addr + LoopConv.castDramOffset(dram_offset))
-  val spad_addr = acc_addr_start +& (och / block_size.U) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
+  val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
 
   // Sizes
   val I = Mux(ocols - ocol > block_size.U, block_size.U, ocols - ocol)
@@ -225,9 +226,10 @@ class LoopConvLdInputReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth:
   val loop_id = UInt(log2Up(concurrent_loops).W)
 }
 
-class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int,
-                      max_block_len: Int, concurrent_loops: Int, latency: Int,
-                      config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module {
+class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int,
+                      tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int, max_block_len: Int,
+                      concurrent_loops: Int, latency: Int, config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2)
+                     (implicit p: Parameters) extends Module {
   val MVIN_SCALE_IDENTITY = 0x3f800000.U // TODO get this from configs somehow
 
   val io = IO(new Bundle {
@@ -401,9 +403,10 @@ class LoopConvLdWeightReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth:
   val loop_id = UInt(log2Up(concurrent_loops).W)
 }
 
-class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int, small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int,
-                       max_block_len: Int, concurrent_loops: Int, latency: Int,
-                       config_mvin_rs1_t: ConfigMvinRs1, mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module {
+class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: Int,
+                       small_iterator_bitwidth: Int, tiny_iterator_bitwidth: Int, max_addr: Int, input_w: Int,
+                       max_block_len: Int, concurrent_loops: Int, latency: Int, config_mvin_rs1_t: ConfigMvinRs1,
+                       mvin_rs2_t: MvinRs2)(implicit p: Parameters) extends Module {
   val MVIN_SCALE_IDENTITY = 0x3f800000.U // TODO get this from configs somehow
 
   val io = IO(new Bundle {
@@ -460,8 +463,9 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
   val dram_addr = req.dram_addr + LoopConv.castDramOffset(dram_offset)
 
   val spad_addr = Mux(req.trans_weight_0132,
-    addr_start + (kch / block_size.U) * krows * kcols * ochs + krow * kcols * ochs + kcol * ochs + och,
-    addr_start + (och / block_size.U) * krows * kcols * kchs + krow * kcols * kchs + kcol * kchs + kch)
+    // The width expansions are added here solely to prevent Verilator's "WIDTH" warnings, despite making the code uglier
+    addr_start + (kch / block_size.U(kch.getWidth.W)) * krows * kcols * ochs + krow * kcols * ochs + kcol * ochs + och,
+    addr_start + (och / block_size.U(och.getWidth.W)) * krows * kcols * kchs + krow * kcols * kchs + kcol * kchs + kch)
 
   // Sizes
   val J = Mux(req.trans_weight_0132,
@@ -646,13 +650,14 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
   // Addresses
   val a_addr = Mux(req.trans_input_3120,
     a_addr_start +& (b / block_size.U) * input_spad_stride +& kch * (irows >> req.downsample) * (icols >> req.downsample) +& (irow >> req.downsample) * (icols >> req.downsample) +& (icol >> req.downsample),
-    a_addr_start +& (kch / block_size.U) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow >> req.downsample) * (icols >> req.downsample) +& (icol >> req.downsample))
+    a_addr_start +& (kch / block_size.U(kch.getWidth.W)) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow >> req.downsample) * (icols >> req.downsample) +& (icol >> req.downsample))
 
   // val c_addr = Mux(ex_overwrite && krow === 0.U && kcol === 0.U && kch === 0.U, d_addr_start, c_addr_start) +&
   //   (och / block_size.U) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
 
+  // The width expansions are added here solely to prevent Verilator's "WIDTH" warnings, despite making the code uglier
   val c_addr = c_addr_start +&
-    (och / block_size.U) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
+    (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
 
   // val new_weights = b === 0.U && orow === 0.U && ocol === 0.U
   val new_weights = Reg(Bool())
@@ -660,8 +665,8 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
   val kcol_ = Mux(req.wrot180, kcols - kcol - 1.U, kcol)
 
   val b_addr = Mux(req.trans_weight_0132,
-    b_addr_start +& (kch / block_size.U) * krows * kcols * ochs +& krow_ * kcols * ochs +& kcol_ * ochs +& och,
-    b_addr_start +& (och / block_size.U) * krows * kcols * kchs +& krow_ * kcols * kchs +& kcol_ * kchs +& kch)
+    b_addr_start +& (kch / block_size.U(och.getWidth.W)) * krows * kcols * ochs +& krow_ * kcols * ochs +& kcol_ * ochs +& och,
+    b_addr_start +& (och / block_size.U(och.getWidth.W)) * krows * kcols * kchs +& krow_ * kcols * kchs +& kcol_ * kchs +& kch)
 
   class RoCCCommandWithAddr extends Bundle {
     val cmd = new RoCCCommand
@@ -877,10 +882,10 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
     ((orow*out_dim*batch_size +& ocol*batch_size +& b) * out_channels +& och) * (input_w/8).U,
     ((b*out_dim*out_dim +& orow*out_dim +& ocol) * out_channels +& och) * (input_w/8).U)
   val dram_addr = req.dram_addr + LoopConv.castDramOffset(dram_offset)
-  val spad_addr = acc_addr_start +& (och / block_size.U) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
+  val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
 
   val pool_dram_addr = req.dram_addr + ((b * pool_out_dim * pool_out_dim) * out_channels + och) * (input_w/8).U
-  val pool_spad_addr = acc_addr_start +& (och / block_size.U) * batches * orows * ocols +& b * orows * ocols
+  val pool_spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols
 
   // Sizes
   val I = Mux(ocols - ocol > block_size.U, block_size.U, ocols - ocol)
@@ -1116,8 +1121,8 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s
 
     result.ichs := kchs
 
-    result.out_channels_per_bank := result.ochs / block_size.U +& (result.ochs % block_size.U =/= 0.U)
-    result.in_channels_per_bank := result.ichs / block_size.U +& (result.ichs % block_size.U =/= 0.U)
+    result.out_channels_per_bank := result.ochs / block_size.U(result.ochs.getWidth.W) +& (result.ochs % block_size.U =/= 0.U)
+    result.in_channels_per_bank := result.ichs / block_size.U(result.ochs.getWidth.W) +& (result.ichs % block_size.U =/= 0.U)
 
     result.bias_spad_stride := batches * orows * ocols
     result.input_spad_stride := Mux(trans_input_3120,
diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala
index 5f564000..86552d56 100644
--- a/src/main/scala/gemmini/LoopMatmul.scala
+++ b/src/main/scala/gemmini/LoopMatmul.scala
@@ -1,3 +1,4 @@
+
 package gemmini
 
 import chisel3._
diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala
index 72386aad..cfd60c28 100644
--- a/src/main/scala/gemmini/ReservationStation.scala
+++ b/src/main/scala/gemmini/ReservationStation.scala
@@ -23,7 +23,8 @@ class ReservationStationIssue[T <: Data](cmd_t: T, id_width: Int) extends Bundle
 }
 
 // TODO we don't need to store the full command in here. We should be able to release the command directly into the relevant controller and only store the associated metadata in the ROB. This would reduce the size considerably
-class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], cmd_t: GemminiCmd) extends Module {
+class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V],
+                                                                       cmd_t: GemminiCmd) extends Module {
   import config._
 
   val block_rows = tileRows * meshRows
@@ -251,7 +252,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
       val mvout_cols = cmd.rs2(32 + mvout_cols_bits - 1, 32)
       val mvout_rows = cmd.rs2(48 + mvout_rows_bits - 1, 48)
 
-      val mvout_mats = mvout_cols / block_cols.U + (mvout_cols % block_cols.U =/= 0.U)
+      val mvout_mats = mvout_cols / block_cols.U(mvout_cols_bits.W) + (mvout_cols % block_cols.U =/= 0.U)
       val total_mvout_rows = ((mvout_mats - 1.U) * block_stride) + mvout_rows
 
       op2.bits.end := op2.bits.start + total_mvout_rows
@@ -273,7 +274,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
       val mvin_cols = cmd.rs2(32 + mvin_cols_bits - 1, 32)
       val mvin_rows = cmd.rs2(48 + mvin_rows_bits - 1, 48)
 
-      val mvin_mats = mvin_cols / block_cols.U + (mvin_cols % block_cols.U =/= 0.U)
+      val mvin_mats = mvin_cols / block_cols.U(mvin_cols_bits.W) + (mvin_cols % block_cols.U =/= 0.U)
       val total_mvin_rows = ((mvin_mats - 1.U) * block_stride) + mvin_rows
 
       // TODO We have to know how the LoopConv's internals work here. Our abstractions are leaking
diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala
index 45ec459c..c9e4fdbb 100644
--- a/src/main/scala/gemmini/StoreController.scala
+++ b/src/main/scala/gemmini/StoreController.scala
@@ -88,7 +88,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   val localaddr = mvout_rs2.local_addr
   val cols = mvout_rs2.num_cols
   val rows = mvout_rs2.num_rows
-  val blocks = (cols / block_cols.U) + (cols % block_cols.U =/= 0.U)
+  val blocks = (cols / block_cols.U(cols.getWidth.W)) + (cols % block_cols.U =/= 0.U)
 
   val config_mvout_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvoutRs1)
   val config_mvout_rs2 = cmd.bits.cmd.rs2.asTypeOf(new ConfigMvoutRs2(acc_scale_t_bits, 32))
diff --git a/src/main/scala/gemmini/ZeroWriter.scala b/src/main/scala/gemmini/ZeroWriter.scala
index a5c10abe..c2861f71 100644
--- a/src/main/scala/gemmini/ZeroWriter.scala
+++ b/src/main/scala/gemmini/ZeroWriter.scala
@@ -1,3 +1,4 @@
+
 package gemmini
 
 import chisel3._
@@ -40,7 +41,7 @@ class ZeroWriter[T <: Data, U <: Data, V <: Data, Tag <: Data](config: GemminiAr
   io.req.ready := !req.valid
 
   io.resp.valid := req.valid
-  io.resp.bits.laddr := req.bits.laddr + req.bits.block_stride * (col_counter / block_cols.U)
+  io.resp.bits.laddr := req.bits.laddr + req.bits.block_stride * (col_counter / block_cols.U(col_counter.getWidth.W)) // The width expansion here is added solely to satisfy Verilator's linter
   io.resp.bits.mask.zipWithIndex.foreach { case (m, i) => m := col_counter + i.U < req.bits.cols }
   io.resp.bits.last := col_counter +& block_cols.U >= req.bits.cols
   io.resp.bits.tag := req.bits.tag

From dc7ffc0ee6acbc946c1585c7c3816c475d0c93b1 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Thu, 25 Aug 2022 17:21:28 -0700
Subject: [PATCH 05/64] build gemmini on self-hosted machines in ci

---
 .github/workflows/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/config.yml b/.github/workflows/config.yml
index cae5f7fd..2a092e8d 100644
--- a/.github/workflows/config.yml
+++ b/.github/workflows/config.yml
@@ -51,7 +51,7 @@ jobs:
 
   prepare-gemmini-config:
     name: prepare-gemmini-config
-    runs-on: ubuntu-latest
+    runs-on: self-hosted
     needs: [prepare-build-environment, install-esp-toolchain]
     container:
       image: ucbbar/chipyard-ci-image:554b436

From 350b3196bab88b75d29c61da70487231b3f2249c Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 22 Nov 2022 15:23:05 -0800
Subject: [PATCH 06/64] Decoupled gemmini dma width from sbus width (#256)

This enables correct functionality of gemmini when sbus width != 16 bytes.
---
 src/main/scala/gemmini/Configs.scala       |  2 --
 src/main/scala/gemmini/ConfigsFP.scala     |  5 -----
 src/main/scala/gemmini/CustomConfigs.scala |  1 -
 src/main/scala/gemmini/DSEConfigs.scala    | 12 ------------
 src/main/scala/gemmini/Scratchpad.scala    |  4 ++--
 5 files changed, 2 insertions(+), 22 deletions(-)

diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala
index 2a060ea9..bd84b317 100644
--- a/src/main/scala/gemmini/Configs.scala
+++ b/src/main/scala/gemmini/Configs.scala
@@ -254,13 +254,11 @@ class DefaultGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
       gemmini
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 // This Gemmini config has both an Int and an FP Gemmini side-by-side, sharing
 // the same scratchpad.
 class DualGemminiConfig extends Config((site, here, up) => {
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
   case BuildRoCC => {
     var int_gemmini: Gemmini[_,_,_] = null
     var fp_gemmini: Gemmini[_,_,_] = null
diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala
index 740ece36..c76907dd 100644
--- a/src/main/scala/gemmini/ConfigsFP.scala
+++ b/src/main/scala/gemmini/ConfigsFP.scala
@@ -121,7 +121,6 @@ class GemminiFP32DefaultConfig extends Config((site, here, up) => {
         LazyModule(new Gemmini(GemminiFPConfigs.FP32DefaultConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 
@@ -134,7 +133,6 @@ class GemminiFP16DefaultConfig extends Config((site, here, up) => {
         LazyModule(new Gemmini(GemminiFPConfigs.FP16DefaultConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========BFLOAT16 Default Config=========
@@ -146,7 +144,6 @@ class GemminiBF16DefaultConfig extends Config((site, here, up) => {
         LazyModule(new Gemmini(GemminiFPConfigs.BF16DefaultConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 class GemminiBF16DefaultHighPerfConfig extends Config((site, here, up) => {
@@ -161,7 +158,6 @@ class GemminiBF16DefaultHighPerfConfig extends Config((site, here, up) => {
       gemmini
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========BFLOAT16 Default Config 8x8=========
@@ -173,6 +169,5 @@ class GemminiBF16Default8Config extends Config((site, here, up) => {
         LazyModule(new Gemmini(GemminiFPConfigs.BF16Default8Config))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala
index 9172e8ee..5b6e0610 100644
--- a/src/main/scala/gemmini/CustomConfigs.scala
+++ b/src/main/scala/gemmini/CustomConfigs.scala
@@ -64,5 +64,4 @@ class GemminiCustomConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
       gemmini
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala
index a34658e3..3ed92c7c 100644
--- a/src/main/scala/gemmini/DSEConfigs.scala
+++ b/src/main/scala/gemmini/DSEConfigs.scala
@@ -119,7 +119,6 @@ class GemminiParamsDSE1 extends Config((site, here, up) => {
         LazyModule(new Gemmini(DSEConfigs.baseConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========DATAFLOW CHANGE: WS=========
@@ -131,7 +130,6 @@ class GemminiParamsDSE2 extends Config((site, here, up) => {
         LazyModule(new Gemmini(DSEConfigs.wsOnlyConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========DATAFLOW CHANGE: BOTH=========
@@ -143,7 +141,6 @@ class GemminiParamsDSE3 extends Config((site, here, up) => {
         LazyModule(new Gemmini(DSEConfigs.bothDataflowsConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========BITWIDTH CHANGE: 32 BITS=========
@@ -155,7 +152,6 @@ class GemminiParamsDSE4 extends Config((site, here, up) => {
         LazyModule(new Gemmini(DSEConfigs.highBitwidthConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========DIMENSIONS CHANGE: 32x32=========
@@ -167,7 +163,6 @@ class GemminiParamsDSE5 extends Config((site, here, up) => {
         LazyModule(new Gemmini(DSEConfigs.largerDimConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========PIPELINE DEPTH CHANGE: Fully Combinational=========
@@ -179,7 +174,6 @@ class GemminiParamsDSE6 extends Config((site, here, up) => {
         LazyModule(new Gemmini(DSEConfigs.fullyCombinationalConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========MEMORY CAPACITY CHANGE: 256 KB=========
@@ -191,7 +185,6 @@ class GemminiParamsDSE7 extends Config((site, here, up) => {
         LazyModule(new Gemmini(DSEConfigs.moreMemoryConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========MEMORY BANKS CHANGE: 33 Banks=========
@@ -203,7 +196,6 @@ class GemminiParamsDSE8 extends Config((site, here, up) => {
         LazyModule(new Gemmini(DSEConfigs.moreBanksConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========BUS WIDTH CHANGE: 64 bits=========
@@ -215,7 +207,6 @@ class GemminiParamsDSE10 extends Config((site, here, up) => {
         LazyModule(new Gemmini(DSEConfigs.narrowerBusConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 8)
 })
 
 //===========PnR 16-by-16=========
@@ -227,7 +218,6 @@ class GemminiParamsPnR16 extends Config((site, here, up) => {
       LazyModule(new Gemmini(DSEConfigs.pnr16Config))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========PnR 32-by-32=========
@@ -239,7 +229,6 @@ class GemminiParamsPnR32 extends Config((site, here, up) => {
       LazyModule(new Gemmini(DSEConfigs.pnr32Config))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 //===========Scalar Processor Change=========
@@ -251,7 +240,6 @@ class GemminiParamsDSE11 extends Config((site, here, up) => {
         LazyModule(new Gemmini(DSEConfigs.baseConfig))
     }
   )
-  case SystemBusKey => up(SystemBusKey).copy(beatBytes = 16)
 })
 
 // -----------------------------
diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala
index 64b66bde..70c9140f 100644
--- a/src/main/scala/gemmini/Scratchpad.scala
+++ b/src/main/scala/gemmini/Scratchpad.scala
@@ -7,7 +7,7 @@ import freechips.rocketchip.config.Parameters
 import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp}
 import freechips.rocketchip.rocket._
 import freechips.rocketchip.tile._
-import freechips.rocketchip.tilelink.{TLIdentityNode, TLXbar, TLBuffer}
+import freechips.rocketchip.tilelink._
 
 import Util._
 
@@ -199,7 +199,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
 
   xbar_node := TLBuffer() := reader.node // TODO
   xbar_node := TLBuffer() := writer.node
-  id_node := TLBuffer() := xbar_node
+  id_node := TLWidthWidget(config.dma_buswidth/8) := TLBuffer() := xbar_node
 
   lazy val module = new LazyModuleImp(this) with HasCoreParameters {
 

From e66e5c075f25419d7c8a19d810b7faf357afaed1 Mon Sep 17 00:00:00 2001
From: Abraham Gonzalez <abe.j.gonza@gmail.com>
Date: Tue, 22 Nov 2022 18:27:14 -0800
Subject: [PATCH 07/64] Rename counter file module (#247)

When testing ucb-bar/chipyard#1239, the new FIRRTL compiler raised an error because a module instance was named `module`. Renaming the instance to `counterfile` avoids this issue.
---
 src/main/scala/gemmini/CounterFile.scala | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/main/scala/gemmini/CounterFile.scala b/src/main/scala/gemmini/CounterFile.scala
index 35f50c20..7b28b8e2 100644
--- a/src/main/scala/gemmini/CounterFile.scala
+++ b/src/main/scala/gemmini/CounterFile.scala
@@ -225,8 +225,8 @@ class CounterController(nPerfCounter: Int, counterWidth: Int)(implicit p: Parame
   if (nPerfCounter > 0) {
     val nCounterIndexBit = log2Ceil(nPerfCounter)
 
-    val module = Module(new CounterFile(nPerfCounter: Int, counterWidth: Int))
-    module.io.event_io <> io.event_io
+    val counterfile = Module(new CounterFile(nPerfCounter: Int, counterWidth: Int))
+    counterfile.io.event_io <> io.event_io
 
     val out_reg = Reg(io.out.bits.cloneType)
     val out_valid_reg = RegInit(false.B)
@@ -242,13 +242,13 @@ class CounterController(nPerfCounter: Int, counterWidth: Int)(implicit p: Parame
     // rs1[31] = External counter flag
 
     io.in.ready := !out_valid_reg
-    module.io.addr := io.in.bits.rs1(nCounterIndexBit + 3, 4)
-    module.io.counter_reset := io.in.bits.rs1(0) & io.in.fire
-    module.io.snapshot_reset := io.in.bits.rs1(1) & io.in.fire
-    module.io.snapshot := io.in.bits.rs1(2) & io.in.fire
-    module.io.config_address.valid := io.in.bits.rs1(3) & io.in.fire
-    module.io.config_address.bits := io.in.bits.rs1(17, 12)
-    module.io.external := io.in.bits.rs1(31)
+    counterfile.io.addr := io.in.bits.rs1(nCounterIndexBit + 3, 4)
+    counterfile.io.counter_reset := io.in.bits.rs1(0) & io.in.fire
+    counterfile.io.snapshot_reset := io.in.bits.rs1(1) & io.in.fire
+    counterfile.io.snapshot := io.in.bits.rs1(2) & io.in.fire
+    counterfile.io.config_address.valid := io.in.bits.rs1(3) & io.in.fire
+    counterfile.io.config_address.bits := io.in.bits.rs1(17, 12)
+    counterfile.io.external := io.in.bits.rs1(31)
 
     when (io.out.fire) {
       out_valid_reg := false.B
@@ -256,7 +256,7 @@ class CounterController(nPerfCounter: Int, counterWidth: Int)(implicit p: Parame
       out_valid_reg := true.B
       out_reg.rd := io.in.bits.inst.rd
       out_reg.data := 0.U
-      out_reg.data := module.io.data
+      out_reg.data := counterfile.io.data
     }
 
     io.out.valid := out_valid_reg

From ee42df52f0b557831ff3fc21de339bfee802265b Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Wed, 23 Nov 2022 14:20:09 -0800
Subject: [PATCH 08/64] Merge in some changes from MLSys tutorial (#257)

Some of the updates made for the MLSys tutorial are also useful for general users. This PR merges in those changes.
---
 SPIKE.hash                                      | 2 +-
 scripts/build-onnx-inference.sh                 | 1 +
 scripts/build-onnx-training.sh                  | 2 +-
 scripts/build-vcs.sh                            | 4 ++--
 scripts/build-verilator.sh                      | 6 +++---
 scripts/run-midas.sh                            | 2 +-
 scripts/run-spike.sh                            | 2 +-
 scripts/run-vcs.sh                              | 2 +-
 scripts/run-verilator.sh                        | 2 +-
 software/gemmini-rocc-tests                     | 2 +-
 software/onnxruntime-riscv                      | 2 +-
 src/main/scala/gemmini/CustomConfigs.scala      | 3 ++-
 src/main/scala/gemmini/DMA.scala                | 3 ---
 src/main/scala/gemmini/ExecuteController.scala  | 3 ---
 src/main/scala/gemmini/MeshWithDelays.scala     | 2 --
 src/main/scala/gemmini/ReservationStation.scala | 1 -
 src/main/scala/gemmini/ZeroWriter.scala         | 9 ++++++++-
 17 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/SPIKE.hash b/SPIKE.hash
index 27baea53..8cbb8d37 100644
--- a/SPIKE.hash
+++ b/SPIKE.hash
@@ -1 +1 @@
-2ed403a70f65559a3c2a06bf724d4737edc73a23
+051d820f08be84d069993de4375d29c91eb2f577
diff --git a/scripts/build-onnx-inference.sh b/scripts/build-onnx-inference.sh
index 01d6e8ce..07999b29 100755
--- a/scripts/build-onnx-inference.sh
+++ b/scripts/build-onnx-inference.sh
@@ -5,3 +5,4 @@ rm -rf ./build/
 ./build.sh --parallel --enable_training --config=Debug --cmake_extra_defines onnxruntime_USE_SYSTOLIC=ON onnxruntime_SYSTOLIC_INT8=ON onnxruntime_SYSTOLIC_FP32=OFF
 cd ./systolic_runner/imagenet_runner/
 ./build.sh --parallel --enable_training --config=Debug
+
diff --git a/scripts/build-onnx-training.sh b/scripts/build-onnx-training.sh
index 55c9bc7b..bcb45565 100755
--- a/scripts/build-onnx-training.sh
+++ b/scripts/build-onnx-training.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-cd /root/chipyard/generators/gemmini/software/onnxruntime-riscv/
+cd ./software/onnxruntime-riscv/
 rm -rf ./build/
 ./build.sh --parallel --enable_training --config=Debug --cmake_extra_defines onnxruntime_USE_SYSTOLIC=ON onnxruntime_SYSTOLIC_INT8=OFF onnxruntime_SYSTOLIC_FP32=ON
 cd ./systolic_runner/imagenet_trainer/
diff --git a/scripts/build-vcs.sh b/scripts/build-vcs.sh
index b15a23f2..23f159b0 100755
--- a/scripts/build-vcs.sh
+++ b/scripts/build-vcs.sh
@@ -9,7 +9,7 @@ help () {
   echo "Options:"
   echo " debug   Builds a VCS simulator which generates waveforms. Without this"
   echo "         option, the simulator will not generate any waveforms."
-  echo " j [N]   Allow N jobs at once; infinite jobs with no arg."
+  echo " j [N]   Allow N jobs at once. Default is 1."
   exit
 }
 
@@ -21,7 +21,7 @@ while [ $# -gt 0 ] ; do
   case $1 in
     -h | --help) show_help=1 ;;
     --debug) debug="debug" ;;
-    -j) j=$1
+    -j) j=$2; shift
   esac
 
   shift
diff --git a/scripts/build-verilator.sh b/scripts/build-verilator.sh
index 49a25f29..477c0910 100755
--- a/scripts/build-verilator.sh
+++ b/scripts/build-verilator.sh
@@ -9,7 +9,7 @@ help () {
   echo "Options:"
   echo " debug   Builds a Verilator simulator which generates waveforms. Without"
   echo "         this option, the simulator will not generate any waveforms."
-  echo " j [N]   Allow N jobs at once; infinite jobs with no arg."
+  echo " j [N]   Allow N jobs at once. Default is 1."
   exit
 }
 
@@ -21,7 +21,7 @@ while [ $# -gt 0 ] ; do
   case $1 in
     -h | --help) show_help=1 ;;
     --debug) debug="debug" ;;
-    -j) j=$1
+    -j) j=$2; shift
   esac
 
   shift
@@ -32,5 +32,5 @@ if [ $show_help -eq 1 ]; then
 fi
 
 cd ../../sims/verilator/
-make ${debug} CONFIG=CustomGemminiSoCConfig
+make -j$j ${debug} CONFIG=CustomGemminiSoCConfig
 
diff --git a/scripts/run-midas.sh b/scripts/run-midas.sh
index 806b21b1..63616809 100755
--- a/scripts/run-midas.sh
+++ b/scripts/run-midas.sh
@@ -94,7 +94,7 @@ fi
 path=""
 suffix=""
 
-for dir in bareMetalC mlps imagenet ; do
+for dir in bareMetalC mlps imagenet transformers ; do
     if [ -f "software/gemmini-rocc-tests/build/${dir}/${binary}$default_suffix" ]; then
         path="${ROOT}/software/gemmini-rocc-tests/build/${dir}/"
         suffix=$default_suffix
diff --git a/scripts/run-spike.sh b/scripts/run-spike.sh
index 00b5349f..1638b76c 100755
--- a/scripts/run-spike.sh
+++ b/scripts/run-spike.sh
@@ -60,7 +60,7 @@ fi
 path=""
 suffix=""
 
-for dir in bareMetalC mlps imagenet ; do
+for dir in bareMetalC mlps imagenet transformers ; do
     if [ -f "software/gemmini-rocc-tests/build/${dir}/${binary}$default_suffix" ]; then
         path="software/gemmini-rocc-tests/build/${dir}/"
         suffix=$default_suffix
diff --git a/scripts/run-vcs.sh b/scripts/run-vcs.sh
index 0fcbd9b1..40ce9bda 100755
--- a/scripts/run-vcs.sh
+++ b/scripts/run-vcs.sh
@@ -73,7 +73,7 @@ fi
 path=""
 suffix=""
 
-for dir in bareMetalC mlps imagenet ; do
+for dir in bareMetalC mlps imagenet transformers ; do
     if [ -f "software/gemmini-rocc-tests/build/${dir}/${binary}$default_suffix" ]; then
         path="${ROOT}/software/gemmini-rocc-tests/build/${dir}/"
         suffix=$default_suffix
diff --git a/scripts/run-verilator.sh b/scripts/run-verilator.sh
index 58d40d2b..b4f21458 100755
--- a/scripts/run-verilator.sh
+++ b/scripts/run-verilator.sh
@@ -73,7 +73,7 @@ fi
 path=""
 suffix=""
 
-for dir in bareMetalC mlps imagenet ; do
+for dir in bareMetalC mlps imagenet transformers ; do
     if [ -f "software/gemmini-rocc-tests/build/${dir}/${binary}$default_suffix" ]; then
         path="${ROOT}/software/gemmini-rocc-tests/build/${dir}/"
         suffix=$default_suffix
diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index 70f8e58b..b631f97c 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit 70f8e58b8113b288f0937a4777fe582437ea36ce
+Subproject commit b631f97c371a52b2cd4fb1f4ec956bbbe86fb34a
diff --git a/software/onnxruntime-riscv b/software/onnxruntime-riscv
index 0c8c9b4f..daa8999f 160000
--- a/software/onnxruntime-riscv
+++ b/software/onnxruntime-riscv
@@ -1 +1 @@
-Subproject commit 0c8c9b4f881b5f31d32c6b5a76cac4ee14a8f338
+Subproject commit daa8999f80d5a233b6c478039f548751b7f02f38
diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala
index 5b6e0610..ae529a69 100644
--- a/src/main/scala/gemmini/CustomConfigs.scala
+++ b/src/main/scala/gemmini/CustomConfigs.scala
@@ -41,7 +41,7 @@ object GemminiCustomConfigs {
     acc_capacity = CapacityInKilobytes(128),
   )
 
- val bertInferenceConfig = defaultConfig.copy(
+  val ibertInferenceConfig = defaultConfig.copy(
     has_training_convs = false,
     has_max_pool =  false,
     has_normalizations = true,
@@ -65,3 +65,4 @@ class GemminiCustomConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
     }
   )
 })
+
diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala
index 41164f30..71148b67 100644
--- a/src/main/scala/gemmini/DMA.scala
+++ b/src/main/scala/gemmini/DMA.scala
@@ -463,9 +463,6 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
     }
     val write_packet = RegEnableThru(best_write_packet, state === s_writing_new_block)
 
-    for (wp <- write_packets)
-      dontTouch(wp)
-
     val write_size = write_packet.size
     val lg_write_size = write_packet.lg_size
     val write_beats = write_packet.total_beats()
diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala
index 514d918e..2ef7fa3f 100644
--- a/src/main/scala/gemmini/ExecuteController.scala
+++ b/src/main/scala/gemmini/ExecuteController.scala
@@ -508,8 +508,6 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
     when (read_a && !io.im2col.req.ready) {
       a_ready := false.B
     }
-    dontTouch(io.im2col.req.ready)
-    dontTouch(read_a)
 
     io.im2col.req.valid := read_a
     io.im2col.req.bits.addr := a_address_rs1
@@ -994,7 +992,6 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   when(io.completed.valid) {
     complete_bits_count := complete_bits_count + 1.U
   }
-  dontTouch(complete_bits_count)
 
   when (reset.asBool()) {
     // pending_completed_rob_id.valid := false.B
diff --git a/src/main/scala/gemmini/MeshWithDelays.scala b/src/main/scala/gemmini/MeshWithDelays.scala
index f6cf7517..d0aced16 100644
--- a/src/main/scala/gemmini/MeshWithDelays.scala
+++ b/src/main/scala/gemmini/MeshWithDelays.scala
@@ -232,8 +232,6 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
   val out_matmul_id = WireInit(shifted(mesh.io.out_id, outBanks, reverse = true)(0)(0))
   io.resp.bits.tag := Mux(tagq.io.deq.valid && out_matmul_id === tagq.io.deq.bits.id, tagq.io.deq.bits.tag, tag_garbage)
 
-  dontTouch(out_matmul_id)
-
   tagq.io.deq.ready := io.resp.valid && io.resp.bits.last && out_matmul_id === tagq.io.deq.bits.id
 
   val total_rows_q = Module(new Queue(new TagWithIdAndTotalRows, tagqlen))
diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala
index cfd60c28..68d0e6e7 100644
--- a/src/main/scala/gemmini/ReservationStation.scala
+++ b/src/main/scala/gemmini/ReservationStation.scala
@@ -180,7 +180,6 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
 
   val alloc_fire = io.alloc.fire()
 
-  dontTouch(new_entry)
   io.alloc.ready := false.B
   when (io.alloc.valid) {
     val spAddrBits = 32
diff --git a/src/main/scala/gemmini/ZeroWriter.scala b/src/main/scala/gemmini/ZeroWriter.scala
index c2861f71..a1834a41 100644
--- a/src/main/scala/gemmini/ZeroWriter.scala
+++ b/src/main/scala/gemmini/ZeroWriter.scala
@@ -41,7 +41,14 @@ class ZeroWriter[T <: Data, U <: Data, V <: Data, Tag <: Data](config: GemminiAr
   io.req.ready := !req.valid
 
   io.resp.valid := req.valid
-  io.resp.bits.laddr := req.bits.laddr + req.bits.block_stride * (col_counter / block_cols.U(col_counter.getWidth.W)) // The width expansion here is added solely to satisfy Verilator's linter
+  io.resp.bits.laddr := req.bits.laddr + req.bits.block_stride * {
+    // This code block was originally just "col_counter / block_cols.U". We
+    // changed it to satisfy Verilator's linter
+    if (col_counter.getWidth >= log2Ceil(block_cols+1))
+      (col_counter / block_cols.U(col_counter.getWidth.W))
+    else
+      0.U
+  }
   io.resp.bits.mask.zipWithIndex.foreach { case (m, i) => m := col_counter + i.U < req.bits.cols }
   io.resp.bits.last := col_counter +& block_cols.U >= req.bits.cols
   io.resp.bits.tag := req.bits.tag

From d2922c605fac2ec580e394081944ac4f21ad7924 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Wed, 23 Nov 2022 14:24:35 -0800
Subject: [PATCH 09/64] bump CHIPYARD.hash to Chipyard 1.8.1

---
 CHIPYARD.hash | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHIPYARD.hash b/CHIPYARD.hash
index b154a058..7fb91902 100644
--- a/CHIPYARD.hash
+++ b/CHIPYARD.hash
@@ -1 +1 @@
-481398b910fa95ec88dd578c67ba358a4d83129d
+004297b6a8c01be1b2110c4cf4f9393ae1ff8805

From 65bd41d034cc1553ba6d28f2865ea89e710b4d6c Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Wed, 23 Nov 2022 15:39:26 -0800
Subject: [PATCH 10/64] fixes for chipyard 1.8.1

---
 .github/scripts/build-toolchains.sh           | 2 +-
 scripts/build-midas.sh                        | 2 +-
 src/main/scala/gemmini/CustomSoCConfigs.scala | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/build-toolchains.sh b/.github/scripts/build-toolchains.sh
index fa6017ea..b23d854e 100755
--- a/.github/scripts/build-toolchains.sh
+++ b/.github/scripts/build-toolchains.sh
@@ -30,6 +30,6 @@ if [ ! -d "$INSTALL_DIR" ]; then
     cd $HOME
 
     # init all submodules including the tools (doesn't use CI_MAKE_PROC due to mem. constraints)
-    CHIPYARD_DIR="$LOCAL_CHIPYARD_DIR" NPROC=$CI_MAKE_NPROC $LOCAL_CHIPYARD_DIR/scripts/build-toolchains.sh esp-tools
+    CHIPYARD_DIR="$LOCAL_CHIPYARD_DIR" NPROC=$CI_MAKE_NPROC $LOCAL_CHIPYARD_DIR/build-setup.sh --skip-conda esp-tools
 fi
 
diff --git a/scripts/build-midas.sh b/scripts/build-midas.sh
index c966513c..ba624087 100755
--- a/scripts/build-midas.sh
+++ b/scripts/build-midas.sh
@@ -54,7 +54,7 @@ if [ dram_model == "" ]; then
 fi
 
 cd ../../sims/firesim/
-source sourceme-f1-manager.sh &> build.log
+source sourceme-f1-manager.sh --skip-ssh-setup &> build.log
 
 cd sim/
 make ${simulator}${debug} TARGET_CONFIG=${dram_model}_WithDefaultFireSimBridges_WithFireSimConfigTweaks_chipyard.CustomGemminiSoCConfig
diff --git a/src/main/scala/gemmini/CustomSoCConfigs.scala b/src/main/scala/gemmini/CustomSoCConfigs.scala
index aebfb520..057aa1e1 100644
--- a/src/main/scala/gemmini/CustomSoCConfigs.scala
+++ b/src/main/scala/gemmini/CustomSoCConfigs.scala
@@ -10,10 +10,10 @@ class CustomGemminiSoCConfig extends Config(
   new chipyard.config.WithL2TLBs(512) ++
 
   new freechips.rocketchip.subsystem.WithInclusiveCache(
-    nBanks = 1,
     nWays = 8,
     capacityKB = 512,
-    outerLatencyCycles = 40
+    outerLatencyCycles = 40,
+    subBankingFactor = 4
   ) ++
 
   // Set the number of CPUs you want to create

From 6bdf36d59234f377829468b97a712b854110e66d Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sat, 26 Nov 2022 09:39:26 -0800
Subject: [PATCH 11/64] updates to ci

---
 .github/scripts/build-toolchains.sh      | 35 ---------
 .github/scripts/defaults.sh              |  2 +
 .github/scripts/do-rtl-build.sh          | 31 +++++---
 .github/scripts/enable-conda.sh          | 13 ++++
 .github/scripts/install-gemmini.sh       | 42 +++++++++++
 .github/scripts/install-verilator.sh     | 20 -----
 .github/scripts/prepare-for-rtl-build.sh | 23 ------
 .github/scripts/remove-chipyard.sh       | 10 +++
 .github/scripts/run-tests-rtl.sh         |  6 +-
 .github/scripts/run-tests-spike.sh       | 24 +++---
 .github/workflows/config.yml             | 96 +++++++++---------------
 11 files changed, 138 insertions(+), 164 deletions(-)
 delete mode 100755 .github/scripts/build-toolchains.sh
 create mode 100644 .github/scripts/enable-conda.sh
 create mode 100755 .github/scripts/install-gemmini.sh
 delete mode 100755 .github/scripts/install-verilator.sh
 delete mode 100755 .github/scripts/prepare-for-rtl-build.sh
 create mode 100755 .github/scripts/remove-chipyard.sh

diff --git a/.github/scripts/build-toolchains.sh b/.github/scripts/build-toolchains.sh
deleted file mode 100755
index b23d854e..00000000
--- a/.github/scripts/build-toolchains.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-#-------------------------------------------------------------
-# create the riscv tools binaries from ucb-bar/chipyard with rocket-chip hash given by riscv-boom
-#
-# run location: circle ci docker image
-# usage:
-#   $1 - name of the toolchain to build
-#-------------------------------------------------------------
-
-# turn echo on and error on earliest command
-set -ex
-
-# get shared variables
-SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
-source $SCRIPT_DIR/defaults.sh
-
-INSTALL_DIR="$HOME/$1-install"
-
-if [ ! -d "$INSTALL_DIR" ]; then
-    cd $HOME
-
-    git clone --progress --verbose https://github.com/ucb-bar/chipyard.git chipyard
-    cd $LOCAL_CHIPYARD_DIR
-
-    echo "Checking out Chipyard version: $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)"
-    git fetch
-    git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
-
-    cd $HOME
-
-    # init all submodules including the tools (doesn't use CI_MAKE_PROC due to mem. constraints)
-    CHIPYARD_DIR="$LOCAL_CHIPYARD_DIR" NPROC=$CI_MAKE_NPROC $LOCAL_CHIPYARD_DIR/build-setup.sh --skip-conda esp-tools
-fi
-
diff --git a/.github/scripts/defaults.sh b/.github/scripts/defaults.sh
index 6a02a220..e403fc89 100755
--- a/.github/scripts/defaults.sh
+++ b/.github/scripts/defaults.sh
@@ -28,6 +28,7 @@ LOCAL_ESP_DIR=$HOME/esp-tools-install
 LOCAL_CHIPYARD_DIR=$HOME/chipyard
 LOCAL_SIM_DIR=$LOCAL_CHIPYARD_DIR/sims/verilator
 LOCAL_VERILATOR_DIR=$HOME/verilator-install
+LOCAL_CONDA=/opt/conda/
 
 echo "::set-output name=LOCAL_WORK_DIR::$LOCAL_WORK_DIR"
 echo "::set-output name=LOCAL_CHECKOUT_DIR::$LOCAL_CHECKOUT_DIR"
@@ -36,3 +37,4 @@ echo "::set-output name=LOCAL_ESP_DIR::$LOCAL_ESP_DIR"
 echo "::set-output name=LOCAL_CHIPYARD_DIR::$LOCAL_CHIPYARD_DIR"
 echo "::set-output name=LOCAL_SIM_DIR::$LOCAL_SIM_DIR"
 echo "::set-output name=LOCAL_VERILATOR_DIR::$LOCAL_VERILATOR_DIR"
+echo "::set-output name=LOCAL_CONDA::$LOCAL_CONDA"
diff --git a/.github/scripts/do-rtl-build.sh b/.github/scripts/do-rtl-build.sh
index 1b93655e..102aaa60 100755
--- a/.github/scripts/do-rtl-build.sh
+++ b/.github/scripts/do-rtl-build.sh
@@ -7,20 +7,27 @@ set -ex
 SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 source $SCRIPT_DIR/defaults.sh
 
-rm -rf $LOCAL_CHIPYARD_DIR/generators/gemmini/*
-cd $LOCAL_CHECKOUT_DIR
-git submodule update --init --recursive software/gemmini-rocc-tests
-mv -f $LOCAL_CHECKOUT_DIR/* $LOCAL_CHIPYARD_DIR/generators/gemmini/
+source $SCRIPT_DIR/enable-conda.sh
 
+cd $LOCAL_CHIPYARD_DIR
+source env.sh
 
-TOOLS_DIR=$LOCAL_ESP_DIR
-LD_LIB_DIR=$LOCAL_ESP_DIR/lib
+cd $LOCAL_CHIPYARD_DIR
+echo Printing current chipyard commit
+git log -1 --format="%H"
 
-# enter the verilator directory and build the specific config on remote server
+cd $LOCAL_CHIPYARD_DIR/generators/gemmini
+echo Printing current gemmini commit
+git log -1 --format="%H"
+
+cd $LOCAL_CHIPYARD_DIR/generators/rocket-chip
+echo Printing rocket-chip commit
+git log -1 --format="%H"
+
+echo Printing rocket-chip sources
+ls src/main/scala/
+
+cd $LOCAL_SIM_DIR
 make -C $LOCAL_SIM_DIR clean
-export RISCV=$TOOLS_DIR
-export LD_LIBRARY_PATH=$LD_LIB_DIR
-export PATH=$LOCAL_VERILATOR_DIR/bin:$PATH
-export VERILATOR_ROOT=$LOCAL_VERILATOR_DIR
-export COURSIER_CACHE=$LOCAL_WORK_DIR/.coursier-cache
 make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=GemminiRocketConfig
+
diff --git a/.github/scripts/enable-conda.sh b/.github/scripts/enable-conda.sh
new file mode 100644
index 00000000..184ead9b
--- /dev/null
+++ b/.github/scripts/enable-conda.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+export PATH="$LOCAL_CONDA/bin:$PATH"
+conda init
+source ~/.bashrc
+conda activate base
+if ! { conda env list | grep 'chipyard'; } >/dev/null 2>&1; then
+    conda create -n chipyard
+    conda activate chipyard
+    conda install -c conda-forge conda-lock
+fi
+conda activate chipyard
+
diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
new file mode 100755
index 00000000..8a2ef5cd
--- /dev/null
+++ b/.github/scripts/install-gemmini.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+#-------------------------------------------------------------
+# installs gemmini
+#
+# run location: circle ci docker image
+#-------------------------------------------------------------
+
+# turn echo on and error on earliest command
+set -ex
+
+# get shared variables
+SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+source $SCRIPT_DIR/defaults.sh
+
+source $SCRIPT_DIR/enable-conda.sh
+
+cd $HOME
+rm -rf chipyard
+git clone --progress --verbose https://github.com/ucb-bar/chipyard.git chipyard
+cd $LOCAL_CHIPYARD_DIR
+
+echo "Checking out Chipyard version: $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)"
+git fetch
+git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
+
+./build-setup.sh esp-tools
+
+source env.sh
+
+cd toolchains/esp-tools/riscv-isa-sim/build
+echo "Checking out Spike version $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)"
+git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)
+make && make install
+
+cd $LOCAL_CHECKOUT_DIR
+chown -R $(whoami) .
+git config --global --add safe.directory $LOCAL_CHECKOUT_DIR
+git submodule update --init --recursive software/gemmini-rocc-tests
+rm -rf $LOCAL_CHIPYARD_DIR/generators/gemmini/* $LOCAL_CHIPYARD_DIR/generators/gemmini/.git*
+mv -f $LOCAL_CHECKOUT_DIR/* $LOCAL_CHECKOUT_DIR/.git* $LOCAL_CHIPYARD_DIR/generators/gemmini/
+
diff --git a/.github/scripts/install-verilator.sh b/.github/scripts/install-verilator.sh
deleted file mode 100755
index b996b4d0..00000000
--- a/.github/scripts/install-verilator.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# move verilator to the remote server
-
-# turn echo on and error on earliest command
-set -ex
-
-# get shared variables
-SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
-source $SCRIPT_DIR/defaults.sh
-
-if [ ! -d "$LOCAL_VERILATOR_DIR" ]; then
-    git clone http://git.veripool.org/git/verilator $LOCAL_VERILATOR_DIR
-    cd $LOCAL_VERILATOR_DIR
-    git checkout $VERILATOR_VERSION
-    autoconf
-    export VERILATOR_ROOT=$LOCAL_VERILATOR_DIR
-    ./configure
-    make -j$LOCAL_MAKE_NPROC
-fi
diff --git a/.github/scripts/prepare-for-rtl-build.sh b/.github/scripts/prepare-for-rtl-build.sh
deleted file mode 100755
index df3ac470..00000000
--- a/.github/scripts/prepare-for-rtl-build.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# turn echo on and error on earliest command
-set -ex
-
-# get shared variables
-SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
-source $SCRIPT_DIR/defaults.sh
-
-# check to see if both dirs exist
-if [ ! -d "$LOCAL_CHIPYARD_DIR" ]; then
-    cd $HOME
-
-    git clone --progress --verbose https://github.com/ucb-bar/chipyard.git chipyard
-    cd $LOCAL_CHIPYARD_DIR
-
-    echo "Checking out Chipyard version: $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)"
-    git fetch
-    git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
-
-    # init all submodules (according to what chipyard wants)
-    ./scripts/init-submodules-no-riscv-tools.sh
-fi
diff --git a/.github/scripts/remove-chipyard.sh b/.github/scripts/remove-chipyard.sh
new file mode 100755
index 00000000..8b82019e
--- /dev/null
+++ b/.github/scripts/remove-chipyard.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+set -ex
+
+SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+source $SCRIPT_DIR/defaults.sh
+
+rm -rf $LOCAL_CHIPYARD_DIR
+rm -rf $LOCAL_CONDA
+
diff --git a/.github/scripts/run-tests-rtl.sh b/.github/scripts/run-tests-rtl.sh
index c5907ddd..47a87ff1 100755
--- a/.github/scripts/run-tests-rtl.sh
+++ b/.github/scripts/run-tests-rtl.sh
@@ -5,9 +5,10 @@ set -ex
 SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 source $SCRIPT_DIR/defaults.sh
 
+source $SCRIPT_DIR/enable-conda.sh
 
-TOOLS_DIR=$LOCAL_ESP_DIR
-PATH=$PATH:$LOCAL_ESP_DIR/bin
+cd $LOCAL_CHIPYARD_DIR
+source env.sh
 
 cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests
 CFLAGS=-DFAST ./build.sh
@@ -15,4 +16,3 @@ CFLAGS=-DFAST ./build.sh
 cd build
 make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_CHIPYARD_DIR/sims/verilator/ CONFIG=GemminiRocketConfig run-binary-hex BINARY='"
 
-
diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh
index 9f933aaf..c6dbf850 100755
--- a/.github/scripts/run-tests-spike.sh
+++ b/.github/scripts/run-tests-spike.sh
@@ -5,23 +5,25 @@ set -ex
 SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 source $SCRIPT_DIR/defaults.sh
 
+source $SCRIPT_DIR/enable-conda.sh
 
-# clone and build our version of spike
-TOOLS_DIR=$LOCAL_ESP_DIR
-PATH=$PATH:$LOCAL_ESP_DIR/bin
+cd $LOCAL_CHIPYARD_DIR
+source env.sh
 
-git clone https://github.com/ucb-bar/esp-isa-sim.git
-cd esp-isa-sim
-git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)
-cp $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests/include/gemmini_params.h gemmini/
+cd $LOCAL_CHIPYARD_DIR/toolchains/esp-tools/riscv-isa-sim
+echo Printing current spike commit
+git log -1 --format="%H"
 
-mkdir build
-cd build
-../configure --prefix=$TOOLS_DIR
-make -j8 install
+cd $LOCAL_CHIPYARD_DIR/generators/gemmini
+echo Printing current gemmini commit
+git log -1 --format="%H"
 
 cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests
+echo Printing current gemmini-rocc-tests commit
+git log -1 --format="%H"
+
 ./build.sh
 
 cd build
 make test-baremetal
+
diff --git a/.github/workflows/config.yml b/.github/workflows/config.yml
index 2a092e8d..f4a2dfda 100644
--- a/.github/workflows/config.yml
+++ b/.github/workflows/config.yml
@@ -1,11 +1,11 @@
 name: Gemmini CI
 on: [push]
 jobs:
-  install-esp-toolchain:
-    name: install-esp-toolchain
+  install-gemmini:
+    name: gemmini-install
     runs-on: ubuntu-latest
     container:
-      image: ucbbar/chipyard-ci-image:554b436
+      image: ucbbar/chipyard-ci-image:3f9150
       options: --entrypoint /bin/bash
     steps:
       - name: checkout
@@ -14,47 +14,23 @@ jobs:
         run: .github/scripts/defaults.sh
         id: get-paths
 
-      - name: toolchain-build
-        run: .github/scripts/build-toolchains.sh esp-tools
+      - name: install gemmini
+        run: .github/scripts/install-gemmini.sh
 
-      - name: cache esp-toolchain install
-        uses: actions/cache@v2
-        with:
-          path: ${{ steps.get-paths.outputs.LOCAL_ESP_DIR }}
-          key: esp-tools-install-${{ github.ref }}-${{ github.sha }}
-
-  prepare-build-environment:
-    name: prepare-build-environment
-    runs-on: ubuntu-latest
-    container:
-      image: ucbbar/chipyard-ci-image:554b436
-      options: --entrypoint /bin/bash
-    steps:
-      - name: checkout
-        uses: actions/checkout@v2
-      - name: get paths
-        run: .github/scripts/defaults.sh
-        id: get-paths
-
-      - name: setup build environment
-        run: .github/scripts/prepare-for-rtl-build.sh
-      - name: install verilator
-        run: .github/scripts/install-verilator.sh
-
-      - name: cache prepare-build-environment install
+      - name: cache gemmini install
         uses: actions/cache@v2
         with:
           path: |
             ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }}
-            ${{ steps.get-paths.outputs.LOCAL_VERILATOR_DIR }}
-          key: prepare-build-environment-${{ github.ref }}-${{ github.sha }}
+            ${{ steps.get-paths.outputs.LOCAL_CONDA }}
+          key: gemmini-install-${{ github.ref }}-${{ github.sha }}
 
-  prepare-gemmini-config:
-    name: prepare-gemmini-config
+  build-gemmini-config:
+    name: build-gemmini-config
     runs-on: self-hosted
-    needs: [prepare-build-environment, install-esp-toolchain]
+    needs: install-gemmini
     container:
-      image: ucbbar/chipyard-ci-image:554b436
+      image: ucbbar/chipyard-ci-image:3f9150
       options: --entrypoint /bin/bash
     steps:
       - name: checkout
@@ -63,38 +39,34 @@ jobs:
         run: .github/scripts/defaults.sh
         id: get-paths
 
-      - name: restore cache esp-toolchain install
-        uses: actions/cache@v2
-        with:
-          path: ${{ steps.get-paths.outputs.LOCAL_ESP_DIR }}
-          key: esp-tools-install-${{ github.ref }}-${{ github.sha }}
+      - name: remove chipyard
+        run: .github/scripts/remove-chipyard.sh
 
-      - name: restore cache prepare-build-environment install
+      - name: restore cache gemmini install
         uses: actions/cache@v2
         with:
           path: |
             ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }}
-            ${{ steps.get-paths.outputs.LOCAL_VERILATOR_DIR }}
-          key: prepare-build-environment-${{ github.ref }}-${{ github.sha }}
+            ${{ steps.get-paths.outputs.LOCAL_CONDA }}
+          key: gemmini-install-${{ github.ref }}-${{ github.sha }}
 
       - name: Building Gemmini Config using Verilator
         run: .github/scripts/do-rtl-build.sh
 
-      - name: cache prepare-gemmini-config install
+      - name: cache build-gemmini-config install
         uses: actions/cache@v2
         with:
           path: |
             ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }}
-            ${{ steps.get-paths.outputs.LOCAL_VERILATOR_DIR }}
-            ${{ steps.get-paths.outputs.LOCAL_ESP_DIR }}
-          key: prepare-gemmini-config-${{ github.ref }}-${{ github.sha }}
+            ${{ steps.get-paths.outputs.LOCAL_CONDA }}
+          key: build-gemmini-config-${{ github.ref }}-${{ github.sha }}
 
   spike-run-tests:
     name: spike-run-tests
     runs-on: ubuntu-latest
-    needs: prepare-gemmini-config
+    needs: install-gemmini
     container:
-      image: ucbbar/chipyard-ci-image:554b436
+      image: ucbbar/chipyard-ci-image:3f9150
       options: --entrypoint /bin/bash
     steps:
       - name: checkout
@@ -103,14 +75,16 @@ jobs:
         run: .github/scripts/defaults.sh
         id: get-paths
 
-      - name: restore cache prepare-gemmini-config install
+      - name: remove chipyard
+        run: .github/scripts/remove-chipyard.sh
+
+      - name: restore cache gemmini install
         uses: actions/cache@v2
         with:
           path: |
             ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }}
-            ${{ steps.get-paths.outputs.LOCAL_VERILATOR_DIR }}
-            ${{ steps.get-paths.outputs.LOCAL_ESP_DIR }}
-          key: prepare-gemmini-config-${{ github.ref }}-${{ github.sha }}
+            ${{ steps.get-paths.outputs.LOCAL_CONDA }}
+          key: gemmini-install-${{ github.ref }}-${{ github.sha }}
 
       - name: run-tests
         run: .github/scripts/run-tests-spike.sh
@@ -118,9 +92,9 @@ jobs:
   rtl-run-tests:
     name: rtl-run-tests
     runs-on: ubuntu-latest
-    needs: prepare-gemmini-config
+    needs: build-gemmini-config
     container:
-      image: ucbbar/chipyard-ci-image:554b436
+      image: ucbbar/chipyard-ci-image:3f9150
       options: --entrypoint /bin/bash
     steps:
       - name: checkout
@@ -129,14 +103,16 @@ jobs:
         run: .github/scripts/defaults.sh
         id: get-paths
 
-      - name: restore cache prepare-gemmini-config install
+      - name: remove chipyard
+        run: .github/scripts/remove-chipyard.sh
+
+      - name: restore cache build-gemmini-config install
         uses: actions/cache@v2
         with:
           path: |
             ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }}
-            ${{ steps.get-paths.outputs.LOCAL_VERILATOR_DIR }}
-            ${{ steps.get-paths.outputs.LOCAL_ESP_DIR }}
-          key: prepare-gemmini-config-${{ github.ref }}-${{ github.sha }}
+            ${{ steps.get-paths.outputs.LOCAL_CONDA }}
+          key: build-gemmini-config-${{ github.ref }}-${{ github.sha }}
 
       - name: run-tests
         run: .github/scripts/run-tests-rtl.sh

From c1ccf7dc6ed2636ca0eff836880d9d2ef481b79f Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sat, 26 Nov 2022 09:43:31 -0800
Subject: [PATCH 12/64] bump gemmini-rocc-tests

---
 software/gemmini-rocc-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index b631f97c..ae0cd823 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit b631f97c371a52b2cd4fb1f4ec956bbbe86fb34a
+Subproject commit ae0cd8236d32fccf7197a7ac0634df5513cec4db

From beb3ee5f2246a153034be161e278b588bf4833a2 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sat, 26 Nov 2022 09:47:03 -0800
Subject: [PATCH 13/64] remove unnecessary echos from ci

---
 .github/scripts/do-rtl-build.sh    | 15 ---------------
 .github/scripts/install-gemmini.sh |  2 --
 .github/scripts/run-tests-spike.sh | 11 -----------
 3 files changed, 28 deletions(-)

diff --git a/.github/scripts/do-rtl-build.sh b/.github/scripts/do-rtl-build.sh
index 102aaa60..38651571 100755
--- a/.github/scripts/do-rtl-build.sh
+++ b/.github/scripts/do-rtl-build.sh
@@ -12,21 +12,6 @@ source $SCRIPT_DIR/enable-conda.sh
 cd $LOCAL_CHIPYARD_DIR
 source env.sh
 
-cd $LOCAL_CHIPYARD_DIR
-echo Printing current chipyard commit
-git log -1 --format="%H"
-
-cd $LOCAL_CHIPYARD_DIR/generators/gemmini
-echo Printing current gemmini commit
-git log -1 --format="%H"
-
-cd $LOCAL_CHIPYARD_DIR/generators/rocket-chip
-echo Printing rocket-chip commit
-git log -1 --format="%H"
-
-echo Printing rocket-chip sources
-ls src/main/scala/
-
 cd $LOCAL_SIM_DIR
 make -C $LOCAL_SIM_DIR clean
 make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=GemminiRocketConfig
diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
index 8a2ef5cd..0fa6460d 100755
--- a/.github/scripts/install-gemmini.sh
+++ b/.github/scripts/install-gemmini.sh
@@ -20,7 +20,6 @@ rm -rf chipyard
 git clone --progress --verbose https://github.com/ucb-bar/chipyard.git chipyard
 cd $LOCAL_CHIPYARD_DIR
 
-echo "Checking out Chipyard version: $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)"
 git fetch
 git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
 
@@ -29,7 +28,6 @@ git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
 source env.sh
 
 cd toolchains/esp-tools/riscv-isa-sim/build
-echo "Checking out Spike version $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)"
 git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)
 make && make install
 
diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh
index c6dbf850..93288a75 100755
--- a/.github/scripts/run-tests-spike.sh
+++ b/.github/scripts/run-tests-spike.sh
@@ -10,18 +10,7 @@ source $SCRIPT_DIR/enable-conda.sh
 cd $LOCAL_CHIPYARD_DIR
 source env.sh
 
-cd $LOCAL_CHIPYARD_DIR/toolchains/esp-tools/riscv-isa-sim
-echo Printing current spike commit
-git log -1 --format="%H"
-
-cd $LOCAL_CHIPYARD_DIR/generators/gemmini
-echo Printing current gemmini commit
-git log -1 --format="%H"
-
 cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests
-echo Printing current gemmini-rocc-tests commit
-git log -1 --format="%H"
-
 ./build.sh
 
 cd build

From ee5746cc1875c74e5841cb0fc904ce5a3ec22049 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sat, 26 Nov 2022 12:18:34 -0800
Subject: [PATCH 14/64] update readme

---
 README.md | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 77d6466d..1e4f78bc 100644
--- a/README.md
+++ b/README.md
@@ -32,27 +32,28 @@ Run these steps to install Chipyard and Spike (make sure to checkout the correct
 ```shell
 git clone https://github.com/ucb-bar/chipyard.git
 cd chipyard
-git checkout 481398b910fa95ec88dd578c67ba358a4d83129d
-./scripts/init-submodules-no-riscv-tools.sh
-./scripts/build-toolchains.sh esp-tools
+git checkout 1.8.1
+./build-setup.sh esp-tools
 
 source env.sh
 
 cd generators/gemmini
 git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
-git fetch --unshallow && git checkout dev && git pull origin dev
-git submodule update
+git checkout dev && git pull origin dev
+git submodule update --init --recursive
+
+SPIKE_HASH=$(cat SPIKE.hash)
 
 cd -
 cd toolchains/esp-tools/riscv-isa-sim/build
-git fetch --unshallow && git checkout 2ed403a70f65559a3c2a06bf724d4737edc73a23
+git checkout $SPIKE_HASH
 make && make install
 
 # The final step is only necessary if you want to run MIDAS simulations with
 # realistic DRAM models
 cd -
 cd sims/firesim
-git fetch --tags && git checkout 1.13.6
+source sourceme-f1-manager.sh --skip-ssh-setup # Ignore error messages from this command
 ./build-setup.sh --library --skip-validate
 ```
 
@@ -466,7 +467,7 @@ When calling `config_mvin` (described below), the programmer can choose which `m
 **Format:** `config_ex rs1 rs2`
 - `rs1[1:0]` must be `00`
 - `rs1[2]` determines if output (0) or weight (1) stationary
-- `rs1[4:3]` = activation function: either relu (1), relu6 (2), or no activation function (0)
+- `rs1[3]` = activation function: either relu (1) or no activation function (0)
 - `rs1[8]` = should A be transposed?
 - `rs1[9]` = should B be transposed?
 - `rs1[31:16]` = the stride (in scratchpad addresses) by which the rows of A are fed into the systolic array.
@@ -477,8 +478,6 @@ If the stride is 2, then we feed every other row into the systolic array instead
     - In the default config, `rs1[63:32]` is of type `float32`
 - `rs2[31:0]` = the number of bits by which the accumulated result of a matmul is right-shifted when leaving the systolic array
     - This parameter is only relevant in output-stationary mode, when partial sums must be accumulated within the systolic array itself, and scaled-down when leaving the systolic array and being written into the scratchpad.
-- `rs2[63:32]` = the number of bits by which 6 should be left-shifted before applying relu6
-    - This parameter is ignored if the relu6 activation function is not being used.
 - `funct` = 0
 
 **Action:** mode <= rs1(2); shift <= rs2; A_stride <= rs1[31:16]
@@ -532,6 +531,12 @@ The parameters controlling this feature are:
 
 **Action:** stride <= rs2; max-pooling parameters <= rs1
 
+### `config_norm` configures normalization commands
+**Format:** `config_norm rs1 rs2`
+
+`config_norm` is an **experimental** command added primarily to support an integer-only variant of BERT called [I-BERT](https://arxiv.org/abs/2101.01321) on Gemmini.
+The command allows users to set scalar constants that are used by I-BERT's GELU, layernorm, softmax variants.
+
 ### `flush` flushes the TLB
 **Format:** `flush rs1`
 - `rs1` = If `rs1[0]` is 1, then the current TLB request is skipped (if it has hit a page-fault and is waiting for an interrupt).

From 2f85926d0d08f21aebd302163b0c6cafe43dc484 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sun, 27 Nov 2022 11:09:55 -0800
Subject: [PATCH 15/64] bump onnxruntime-riscv

---
 software/onnxruntime-riscv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/software/onnxruntime-riscv b/software/onnxruntime-riscv
index daa8999f..f6d2fc95 160000
--- a/software/onnxruntime-riscv
+++ b/software/onnxruntime-riscv
@@ -1 +1 @@
-Subproject commit daa8999f80d5a233b6c478039f548751b7f02f38
+Subproject commit f6d2fc95463316ec47d7f832f35be03c26887922

From 0d863352425968d70b0454ce0236bcfd3372d16f Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sun, 27 Nov 2022 11:16:53 -0800
Subject: [PATCH 16/64] update readme

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 1e4f78bc..20f136d3 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ We provide here a quick guide to installing Gemmini's dependencies (Chipyard and
 Dependencies
 ---------
 
-Before beginning, install the [Chipyard dependencies](https://chipyard.readthedocs.io/en/latest/Chipyard-Basics/Initial-Repo-Setup.html#requirements) that are described here.
+Before beginning, install the [Chipyard dependencies](https://chipyard.readthedocs.io/en/latest/Chipyard-Basics/Initial-Repo-Setup.html#default-requirements-installation).
 
 Installing Chipyard and Spike
 -----------------------------
@@ -144,7 +144,7 @@ cd chipyard/generators/gemmini
 Next steps
 --------
 
-Check out [our IISWC 2021 tutorial](https://sites.google.com/berkeley.edu/gemminitutorialiiswc2021/) to learn how to:
+Check out our [MLSys 2022 tutorial](https://sites.google.com/berkeley.edu/gemmini-tutorial-mlsys-2022) (or our earlier but more out-of-date [IISWC 2021 tutorial](https://sites.google.com/berkeley.edu/gemminitutorialiiswc2021/)) to learn how to:
 * build different types of diverse accelerators using Gemmini.
 * add custom datatypes to Gemmini.
 * write your own Gemmini programs.
@@ -535,7 +535,7 @@ The parameters controlling this feature are:
 **Format:** `config_norm rs1 rs2`
 
 `config_norm` is an **experimental** command added primarily to support an integer-only variant of BERT called [I-BERT](https://arxiv.org/abs/2101.01321) on Gemmini.
-The command allows users to set scalar constants that are used by I-BERT's GELU, layernorm, softmax variants.
+The command allows users to set scalar constants that are used by I-BERT's GELU, layernorm, and softmax variants.
 
 ### `flush` flushes the TLB
 **Format:** `flush rs1`

From d3a58f4c978c3a5a84ef9950fa9d4339395d37fe Mon Sep 17 00:00:00 2001
From: gnodipac886 <44887166+gnodipac886@users.noreply.github.com>
Date: Thu, 29 Dec 2022 22:21:43 -0500
Subject: [PATCH 17/64] reduce PE area by using a single MAC unit per PE (#265)

The PE was not synthesizing properly on FPGAs because multiple MAC units were being synthesized within a single PE.
This commit adds a new MAC unit class and instantiates a single MAC unit per PE to reduce area overhead.

This addresses #262

Co-authored-by: Hasan Genc <hngenc@berkeley.edu>
Co-authored-by: Eric Dong <ericdong@g.harvard.edu>
---
 README.md                       |  8 +------
 src/main/scala/gemmini/PE.scala | 40 +++++++++++++++++++++++++++++----
 2 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index e96c9177..5f310564 100644
--- a/README.md
+++ b/README.md
@@ -2,12 +2,6 @@
   <img width="1000" src="./img/full-logo.svg">
 </p>
 
-Upcoming Tutorial
-===============================
-We will be presenting [a new tutorial](https://sites.google.com/berkeley.edu/gemmini-tutorial-mlsys-2022) for Gemmini at MLSys 2022, on August 29th, 2022.
-
-If you would like to attend, **then please register online** [at this link](https://docs.google.com/forms/d/1bdIXegBkEMJY88YuD80HN40haZ9tx_bZgmaN3FON5DI/edit). We're looking forward to meeting you all!
-
 Gemmini
 ====================================
 
@@ -52,7 +46,7 @@ SPIKE_HASH=$(cat SPIKE.hash)
 
 cd -
 cd toolchains/esp-tools/riscv-isa-sim/build
-git checkout $SPIKE_HASH
+git fetch && git checkout $SPIKE_HASH
 make && make install
 
 # The final step is only necessary if you want to run MIDAS simulations with
diff --git a/src/main/scala/gemmini/PE.scala b/src/main/scala/gemmini/PE.scala
index 5f7205bd..9518942f 100644
--- a/src/main/scala/gemmini/PE.scala
+++ b/src/main/scala/gemmini/PE.scala
@@ -11,6 +11,18 @@ class PEControl[T <: Data : Arithmetic](accType: T) extends Bundle {
 
 }
 
+class MacUnit[T <: Data](inputType: T, cType: T, dType: T) (implicit ev: Arithmetic[T]) extends Module { // register-free wrapper around one Arithmetic `mac` call
+  import ev._ // needed so that `mac` resolves on values of type T
+  val io = IO(new Bundle {
+    val in_a  = Input(inputType) // first argument passed to `mac`
+    val in_b  = Input(inputType) // second argument passed to `mac`
+    val in_c  = Input(cType) // value on which `mac` is invoked
+    val out_d = Output(dType) // result of the `mac` call, driven onto a dType port
+  })
+
+  io.out_d := io.in_c.mac(io.in_a, io.in_b) // presumably in_c + in_a * in_b -- TODO confirm against Arithmetic.mac
+}
+
 // TODO update documentation
 /**
   * A PE implementing a MAC operation. Configured as fully combinational when integrated into a Mesh.
@@ -45,6 +57,12 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value,
 
   val cType = if (df == Dataflow.WS) inputType else accType
 
+  // When creating PEs that support multiple dataflows, the
+  // elaboration/synthesis tools often fail to consolidate and de-duplicate
+  // MAC units. To force mac circuitry to be re-used, we create a "mac_unit"
+  // module here which just performs a single MAC operation
+  val mac_unit = Module(new MacUnit(inputType, cType, outputType))
+
   val a  = io.in_a
   val b  = io.in_b
   val d  = io.in_d
@@ -65,6 +83,8 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value,
   io.out_last := last
   io.out_valid := valid
 
+  mac_unit.io.in_a := a
+
   val last_s = RegEnable(prop, valid)
   val flip = last_s =/= prop
   val shift_offset = Mux(flip, shift, 0.U)
@@ -82,22 +102,30 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value,
     when(prop === PROPAGATE) {
       io.out_c := (c1 >> shift_offset).clippedToWidthOf(outputType)
       io.out_b := b
-      c2 := c2.mac(a, b.asTypeOf(inputType))
+      mac_unit.io.in_b := b.asTypeOf(inputType)
+      mac_unit.io.in_c := c2
+      c2 := mac_unit.io.out_d
       c1 := d.withWidthOf(cType)
     }.otherwise {
       io.out_c := (c2 >> shift_offset).clippedToWidthOf(outputType)
       io.out_b := b
-      c1 := c1.mac(a, b.asTypeOf(inputType))
+      mac_unit.io.in_b := b.asTypeOf(inputType)
+      mac_unit.io.in_c := c1
+      c1 := mac_unit.io.out_d
       c2 := d.withWidthOf(cType)
     }
   }.elsewhen ((df == Dataflow.WS).B || ((df == Dataflow.BOTH).B && dataflow === WEIGHT_STATIONARY)) {
     when(prop === PROPAGATE) {
       io.out_c := c1
-      io.out_b := b.mac(a, c2.asTypeOf(inputType))
+      mac_unit.io.in_b := c2.asTypeOf(inputType)
+      mac_unit.io.in_c := b
+      io.out_b := mac_unit.io.out_d
       c1 := d
     }.otherwise {
       io.out_c := c2
-      io.out_b := b.mac(a, c1.asTypeOf(inputType))
+      mac_unit.io.in_b := c1.asTypeOf(inputType)
+      mac_unit.io.in_c := b
+      io.out_b := mac_unit.io.out_d
       c2 := d
     }
   }.otherwise {
@@ -105,10 +133,14 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value,
     //assert(false.B, "unknown dataflow")
     io.out_c := DontCare
     io.out_b := DontCare
+    mac_unit.io.in_b := b.asTypeOf(inputType)
+    mac_unit.io.in_c := c2
   }
 
   when (!valid) {
     c1 := c1
     c2 := c2
+    mac_unit.io.in_b := DontCare
+    mac_unit.io.in_c := DontCare
   }
 }

From 92972df936bc02608f29c1eb75bd4d146e4d4932 Mon Sep 17 00:00:00 2001
From: SingularityKChen <chency_singularity@163.com>
Date: Fri, 30 Dec 2022 11:26:18 +0800
Subject: [PATCH 18/64] fix: chisel3 pr #2758 (#269)

Remove parenthesized forms of asUInt(), asBool(), asSInt(), orR(), andR(), zext()

Co-authored-by: Abraham Gonzalez <abe.j.gonza@gmail.com>
Co-authored-by: Hasan Genc <hngenc@berkeley.edu>
Co-authored-by: joey0320 <joonho0320@gmail.com>
---
 src/main/scala/gemmini/AccumulatorMem.scala   |   6 +-
 src/main/scala/gemmini/AccumulatorScale.scala |   6 +-
 src/main/scala/gemmini/Arithmetic.scala       |  28 ++--
 src/main/scala/gemmini/BeatMerger.scala       |  16 +--
 src/main/scala/gemmini/DMA.scala              |  20 +--
 .../scala/gemmini/DMACommandTracker.scala     |   2 +-
 src/main/scala/gemmini/DSEConfigs.scala       |   6 +-
 .../scala/gemmini/ExecuteController.scala     |   8 +-
 src/main/scala/gemmini/FrontendTLB.scala      |   2 +-
 src/main/scala/gemmini/Im2Col.scala           |   4 +-
 .../gemmini/InstructionCompression.scala      |   4 +-
 src/main/scala/gemmini/LocalAddr.scala        |   4 +-
 src/main/scala/gemmini/LoopConv.scala         | 126 +++++++++---------
 src/main/scala/gemmini/LoopMatmul.scala       |  56 ++++----
 src/main/scala/gemmini/LoopUnroller.scala     |   4 +-
 src/main/scala/gemmini/MeshWithDelays.scala   |   2 +-
 src/main/scala/gemmini/Normalizer.scala       |  10 +-
 src/main/scala/gemmini/PixelRepeater.scala    |   6 +-
 .../scala/gemmini/ReservationStation.scala    |   2 +-
 src/main/scala/gemmini/Scratchpad.scala       |   6 +-
 src/main/scala/gemmini/StoreController.scala  |   4 +-
 src/main/scala/gemmini/TagQueue.scala         |   2 +-
 src/main/scala/gemmini/TilerScheduler.scala   |   4 +-
 src/main/scala/gemmini/Util.scala             |  12 +-
 src/main/scala/gemmini/XactTracker.scala      |   2 +-
 src/main/scala/gemmini/ZeroWriter.scala       |   2 +-
 26 files changed, 172 insertions(+), 172 deletions(-)

diff --git a/src/main/scala/gemmini/AccumulatorMem.scala b/src/main/scala/gemmini/AccumulatorMem.scala
index dd5ed821..c664bd0f 100644
--- a/src/main/scala/gemmini/AccumulatorMem.scala
+++ b/src/main/scala/gemmini/AccumulatorMem.scala
@@ -230,7 +230,7 @@ class AccumulatorMem[T <: Data, U <: Data](
       val wmask = Mux1H(w_q_head.asBools, w_q.map(_.mask))
       val waddr = Mux1H(w_q_head.asBools, w_q.map(_.addr))
       when (wen) {
-        w_q_head := (w_q_head << 1).asUInt() | w_q_head(nEntries-1)
+        w_q_head := (w_q_head << 1).asUInt | w_q_head(nEntries-1)
         for (i <- 0 until nEntries) {
           when (w_q_head(i)) {
             w_q(i).valid := false.B
@@ -243,7 +243,7 @@ class AccumulatorMem[T <: Data, U <: Data](
       when (w_q_push) {
         assert(!w_q_full || wen, "we ran out of acc-sub-bank write q entries")
 
-        w_q_tail := (w_q_tail << 1).asUInt() | w_q_tail(nEntries-1)
+        w_q_tail := (w_q_tail << 1).asUInt | w_q_tail(nEntries-1)
         for (i <- 0 until nEntries) {
           when (w_q_tail(i)) {
             w_q(i).valid := true.B
@@ -334,7 +334,7 @@ class AccumulatorMem[T <: Data, U <: Data](
   io.write.ready := !block_write_req &&
     !pipelined_writes.map(r => r.valid && r.bits.addr === io.write.bits.addr && io.write.bits.acc).reduce(_||_)
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     pipelined_writes.foreach(_.valid := false.B)
   }
 
diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala
index 1fdd15fa..bf98a144 100644
--- a/src/main/scala/gemmini/AccumulatorScale.scala
+++ b/src/main/scala/gemmini/AccumulatorScale.scala
@@ -175,7 +175,7 @@ class AccumulatorScale[T <: Data, U <: Data](
           completed_masks(i).foreach(_ := false.B)
         }
       }
-      tail_oh := (tail_oh << 1).asUInt() | tail_oh(nEntries-1)
+      tail_oh := (tail_oh << 1).asUInt | tail_oh(nEntries-1)
     }
 
     val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t)(ev))) }
@@ -282,10 +282,10 @@ object AccumulatorScale {
     // qln2_inv / S / (2 ** 16) = 1 / ln2
     // q * qln2_inv = x / S / ln2 * S * (2 ** 16) = x / ln2 * (2 ** 16)
     val neg_q_iexp = neg(q)
-    val z_iexp = (neg_q_iexp * qln2_inv).asUInt().do_>>(16).asTypeOf(q) // q is non-positive
+    val z_iexp = (neg_q_iexp * qln2_inv).asUInt.do_>>(16).asTypeOf(q) // q is non-positive
     val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q)
     val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + qb).withWidthOf(q)
     // we dont want a rounding shift
-    (q_poly_iexp.asUInt().do_>>(z_iexp.asUInt()(5, 0))).asTypeOf(q)
+    (q_poly_iexp.asUInt.do_>>(z_iexp.asUInt(5, 0))).asTypeOf(q)
   }}
 
diff --git a/src/main/scala/gemmini/Arithmetic.scala b/src/main/scala/gemmini/Arithmetic.scala
index cdd36396..c6792578 100644
--- a/src/main/scala/gemmini/Arithmetic.scala
+++ b/src/main/scala/gemmini/Arithmetic.scala
@@ -62,12 +62,12 @@ object Arithmetic {
 
         // TODO Do we need to explicitly handle the cases where "u" is a small number (like 0)? What is the default behavior here?
         val point_five = Mux(u === 0.U, 0.U, self(u - 1.U))
-        val zeros = Mux(u <= 1.U, 0.U, self.asUInt() & ((1.U << (u - 1.U)).asUInt() - 1.U)) =/= 0.U
+        val zeros = Mux(u <= 1.U, 0.U, self.asUInt & ((1.U << (u - 1.U)).asUInt - 1.U)) =/= 0.U
         val ones_digit = self(u)
 
         val r = point_five & (zeros | ones_digit)
 
-        (self >> u).asUInt() + r
+        (self >> u).asUInt + r
       }
 
       override def >(t: UInt): Bool = self > t
@@ -99,19 +99,19 @@ object Arithmetic {
 
         // TODO Do we need to explicitly handle the cases where "u" is a small number (like 0)? What is the default behavior here?
         val point_five = Mux(u === 0.U, 0.U, self(u - 1.U))
-        val zeros = Mux(u <= 1.U, 0.U, self.asUInt() & ((1.U << (u - 1.U)).asUInt() - 1.U)) =/= 0.U
+        val zeros = Mux(u <= 1.U, 0.U, self.asUInt & ((1.U << (u - 1.U)).asUInt - 1.U)) =/= 0.U
         val ones_digit = self(u)
 
-        val r = (point_five & (zeros | ones_digit)).asBool()
+        val r = (point_five & (zeros | ones_digit)).asBool
 
-        (self >> u).asSInt() + Mux(r, 1.S, 0.S)
+        (self >> u).asSInt + Mux(r, 1.S, 0.S)
       }
 
       override def >(t: SInt): Bool = self > t
 
       override def withWidthOf(t: SInt) = {
         if (self.getWidth >= t.getWidth)
-          self(t.getWidth-1, 0).asSInt()
+          self(t.getWidth-1, 0).asSInt
         else {
           val sign_bits = t.getWidth - self.getWidth
           val sign = self(self.getWidth-1)
@@ -122,7 +122,7 @@ object Arithmetic {
       override def clippedToWidthOf(t: SInt): SInt = {
         val maxsat = ((1 << (t.getWidth-1))-1).S
         val minsat = (-(1 << (t.getWidth-1))).S
-        MuxCase(self, Seq((self > maxsat) -> maxsat, (self < minsat) -> minsat))(t.getWidth-1, 0).asSInt()
+        MuxCase(self, Seq((self > maxsat) -> maxsat, (self < minsat) -> minsat))(t.getWidth-1, 0).asSInt
       }
 
       override def relu: SInt = Mux(self >= 0.S, self, 0.S)
@@ -144,7 +144,7 @@ object Arithmetic {
         def sin_to_float(x: SInt) = {
           val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth))
           in_to_rec_fn.io.signedIn := true.B
-          in_to_rec_fn.io.in := x.asUInt()
+          in_to_rec_fn.io.in := x.asUInt
           in_to_rec_fn.io.roundingMode := consts.round_minMag // consts.round_near_maxMag
           in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
 
@@ -167,7 +167,7 @@ object Arithmetic {
           rec_fn_to_in.io.in := x
           rec_fn_to_in.io.roundingMode := consts.round_minMag // consts.round_near_maxMag
 
-          rec_fn_to_in.io.out.asSInt()
+          rec_fn_to_in.io.out.asSInt
         }
 
         val self_rec = sin_to_float(self)
@@ -207,7 +207,7 @@ object Arithmetic {
         def in_to_float(x: SInt) = {
           val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth))
           in_to_rec_fn.io.signedIn := true.B
-          in_to_rec_fn.io.in := x.asUInt()
+          in_to_rec_fn.io.in := x.asUInt
           in_to_rec_fn.io.roundingMode := consts.round_minMag // consts.round_near_maxMag
           in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
 
@@ -220,7 +220,7 @@ object Arithmetic {
           rec_fn_to_in.io.in := x
           rec_fn_to_in.io.roundingMode := consts.round_minMag // consts.round_near_maxMag
 
-          rec_fn_to_in.io.out.asSInt()
+          rec_fn_to_in.io.out.asSInt
         }
 
         val self_rec = in_to_float(self)
@@ -255,7 +255,7 @@ object Arithmetic {
           def in_to_float(x: SInt) = {
             val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth))
             in_to_rec_fn.io.signedIn := true.B
-            in_to_rec_fn.io.in := x.asUInt()
+            in_to_rec_fn.io.in := x.asUInt
             in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag
             in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
 
@@ -291,7 +291,7 @@ object Arithmetic {
           def in_to_float(x: SInt) = {
             val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth))
             in_to_rec_fn.io.signedIn := true.B
-            in_to_rec_fn.io.in := x.asUInt()
+            in_to_rec_fn.io.in := x.asUInt
             in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag
             in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
 
@@ -304,7 +304,7 @@ object Arithmetic {
             rec_fn_to_in.io.in := x
             rec_fn_to_in.io.roundingMode := consts.round_minMag
 
-            rec_fn_to_in.io.out.asSInt()
+            rec_fn_to_in.io.out.asSInt
           }
 
           val self_rec = in_to_float(self)
diff --git a/src/main/scala/gemmini/BeatMerger.scala b/src/main/scala/gemmini/BeatMerger.scala
index a6a67dab..e8f22b2a 100644
--- a/src/main/scala/gemmini/BeatMerger.scala
+++ b/src/main/scala/gemmini/BeatMerger.scala
@@ -59,11 +59,11 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid
   }
 
   val last_sending = bytesSent_next === req.bits.bytes_to_read
-  val last_reading = beatBytes.U >= (1.U << req.bits.lg_len_req).asUInt() - bytesRead
+  val last_reading = beatBytes.U >= (1.U << req.bits.lg_len_req).asUInt - bytesRead
 
   io.req.ready := !req.valid
 
-  io.in.ready := io.req.fire || (req.valid && bytesRead =/= (1.U << req.bits.lg_len_req).asUInt())
+  io.in.ready := io.req.fire || (req.valid && bytesRead =/= (1.U << req.bits.lg_len_req).asUInt)
 
   io.out.valid := req.valid && usefulBytesRead > bytesSent && (usefulBytesRead - bytesSent >= rowBytes ||
     usefulBytesRead === req.bits.bytes_to_read)
@@ -90,7 +90,7 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid
   io.out.bits.accumulate := req.bits.accumulate
   io.out.bits.has_acc_bitwidth := req.bits.has_acc_bitwidth
 
-  when (bytesRead === (1.U << req.bits.lg_len_req).asUInt() &&
+  when (bytesRead === (1.U << req.bits.lg_len_req).asUInt &&
     bytesSent === req.bits.bytes_to_read) {
     req.pop()
   }
@@ -98,7 +98,7 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid
   when (io.out.fire) {
     bytesSent := bytesSent_next
 
-    when (last_sending && bytesRead === (1.U << req.bits.lg_len_req).asUInt()) {
+    when (last_sending && bytesRead === (1.U << req.bits.lg_len_req).asUInt) {
       req.pop()
       io.req.ready := true.B
     }
@@ -116,16 +116,16 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid
     val current_usefulBytesRead = Mux(io.req.fire, 0.U, usefulBytesRead)
     val current_shift = Mux(io.req.fire, io.req.bits.shift, req.bits.shift)
     val current_lg_len_req = Mux(io.req.fire, io.req.bits.lg_len_req, req.bits.lg_len_req)
-    val current_len_req = (1.U << current_lg_len_req).asUInt()
+    val current_len_req = (1.U << current_lg_len_req).asUInt
 
     when (current_shift - current_bytesDiscarded <= beatBytes.U /* &&
       current_bytesRead < current_len_req */
     ) {
       val rshift = (current_shift - current_bytesDiscarded) * 8.U // in bits
       val lshift = current_usefulBytesRead * 8.U // in bits
-      val mask = (~(((~0.U(beatBits.W)) >> rshift) << lshift)).asUInt()
+      val mask = (~(((~0.U(beatBits.W)) >> rshift) << lshift)).asUInt
 
-      buffer := (buffer & mask) | ((io.in.bits >> rshift) << lshift).asUInt()
+      buffer := (buffer & mask) | ((io.in.bits >> rshift) << lshift).asUInt
     }
 
     bytesRead := satAdd(current_bytesRead, beatBytes.U, current_len_req)
@@ -135,7 +135,7 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid
     }
   }
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     req.valid := false.B
   }
 }
diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala
index 71148b67..729e17c7 100644
--- a/src/main/scala/gemmini/DMA.scala
+++ b/src/main/scala/gemmini/DMA.scala
@@ -380,7 +380,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
     // TODO use the same register to hold data_blocks and data_single_block, so that this Mux here is not necessary
     val data_blocks = Reg(Vec(maxBlocks, UInt((inputTypeRowBytes * 8).W)))
     val data_single_block = Reg(UInt(dataWidth.W)) // For data that's just one-block-wide
-    val data = Mux(req.block === 0.U, data_single_block, data_blocks.asUInt())
+    val data = Mux(req.block === 0.U, data_single_block, data_blocks.asUInt)
 
     val bytesSent = Reg(UInt(log2Ceil((dataBytes max maxBytes)+1).W))  // TODO this only needs to count up to (dataBytes/aligned_to), right?
     val bytesLeft = req.len - bytesSent
@@ -390,9 +390,9 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
     val xactId = OHToUInt(xactOnehot)
 
     val xactBusy_fire = WireInit(false.B)
-    val xactBusy_add = Mux(xactBusy_fire, (1.U << xactId).asUInt(), 0.U)
-    val xactBusy_remove = ~Mux(tl.d.fire, (1.U << tl.d.bits.source).asUInt(), 0.U)
-    xactBusy := (xactBusy | xactBusy_add) & xactBusy_remove.asUInt()
+    val xactBusy_add = Mux(xactBusy_fire, (1.U << xactId).asUInt, 0.U)
+    val xactBusy_remove = ~Mux(tl.d.fire, (1.U << tl.d.bits.source).asUInt, 0.U)
+    xactBusy := (xactBusy | xactBusy_add) & xactBusy_remove.asUInt
 
     val state_machine_ready_for_req = WireInit(state === s_idle)
     io.req.ready := state_machine_ready_for_req
@@ -482,15 +482,15 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
       fromSource = RegEnableThru(xactId, state === s_writing_new_block),
       toAddress = 0.U,
       lgSize = lg_write_size,
-      data = (data >> (bytesSent * 8.U)).asUInt()
+      data = (data >> (bytesSent * 8.U)).asUInt
     )._2
 
     val putPartial = edge.Put(
       fromSource = RegEnableThru(xactId, state === s_writing_new_block),
       toAddress = 0.U,
       lgSize = lg_write_size,
-      data = ((data >> (bytesSent * 8.U)) << (write_shift * 8.U)).asUInt(),
-      mask = write_mask.asUInt()
+      data = ((data >> (bytesSent * 8.U)) << (write_shift * 8.U)).asUInt,
+      mask = write_mask.asUInt
     )._2
 
     class TLBundleAWithInfo extends Bundle {
@@ -501,7 +501,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
 
     val untranslated_a = Wire(Decoupled(new TLBundleAWithInfo))
     xactBusy_fire := untranslated_a.fire && state === s_writing_new_block
-    untranslated_a.valid := (state === s_writing_new_block || state === s_writing_beats) && !xactBusy.andR()
+    untranslated_a.valid := (state === s_writing_new_block || state === s_writing_beats) && !xactBusy.andR
     untranslated_a.bits.tl_a := Mux(write_full, putFull, putPartial)
     untranslated_a.bits.vaddr := write_vaddr
     untranslated_a.bits.status := req.status
@@ -543,7 +543,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
     tl.a.bits := translate_q.io.deq.bits.tl_a
     tl.a.bits.address := RegEnableThru(io.tlb.resp.paddr, RegNext(io.tlb.req.fire))
 
-    tl.d.ready := xactBusy.orR()
+    tl.d.ready := xactBusy.orR
 
     when (untranslated_a.fire) {
       when (state === s_writing_new_block) {
@@ -588,7 +588,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
         val v1 = io.req.bits.data.asTypeOf(Vec(cols, inputType))
         val v2 = data_single_block.asTypeOf(Vec(cols, inputType))
         val m = v1.zip(v2)
-        VecInit(m.zipWithIndex.map{case ((x, y), i) => if (i < block_cols) maxOf(x, y) else y}).asUInt()
+        VecInit(m.zipWithIndex.map{case ((x, y), i) => if (i < block_cols) maxOf(x, y) else y}).asUInt
       }
 
       req := io.req.bits
diff --git a/src/main/scala/gemmini/DMACommandTracker.scala b/src/main/scala/gemmini/DMACommandTracker.scala
index 9d4f71e6..a2b5df32 100644
--- a/src/main/scala/gemmini/DMACommandTracker.scala
+++ b/src/main/scala/gemmini/DMACommandTracker.scala
@@ -93,7 +93,7 @@ class DMACommandTracker[T <: Data](val nCmds: Int, val maxBytes: Int, tag_t: =>
     cmds(io.cmd_completed.bits.cmd_id).valid := false.B
   }
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     cmds.foreach(_.init())
   }
 }
diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala
index 3ed92c7c..257721ca 100644
--- a/src/main/scala/gemmini/DSEConfigs.scala
+++ b/src/main/scala/gemmini/DSEConfigs.scala
@@ -51,12 +51,12 @@ object DSEBaseConfig {
 
         // TODO Do we need to explicitly handle the cases where "u" is a small number (like 0)? What is the default behavior here?
         val point_five = Mux(u === 0.U, 0.U, t(u - 1.U))
-        val zeros = Mux(u <= 1.U, 0.U, t.asUInt() & ((1.U << (u - 1.U)).asUInt() - 1.U)) =/= 0.U
+        val zeros = Mux(u <= 1.U, 0.U, t.asUInt & ((1.U << (u - 1.U)).asUInt - 1.U)) =/= 0.U
         val ones_digit = t(u)
 
-        val r = (point_five & (zeros | ones_digit)).asBool()
+        val r = (point_five & (zeros | ones_digit)).asBool
 
-        (t >> u).asSInt() + Mux(r, 1.S, 0.S)
+        (t >> u).asSInt + Mux(r, 1.S, 0.S)
       }, 0, UInt(8.W), -1)),
     acc_read_full_width = true,
     acc_read_small_width = true,
diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala
index 2ef7fa3f..65add720 100644
--- a/src/main/scala/gemmini/ExecuteController.scala
+++ b/src/main/scala/gemmini/ExecuteController.scala
@@ -801,8 +801,8 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   mesh_cntl_signals_q.io.enq.bits.first := !a_fire_started && !b_fire_started && !d_fire_started
 
   val readData = VecInit(io.srams.read.map(_.resp.bits.data))
-  val accReadData = if (ex_read_from_acc) VecInit(io.acc.read_resp.map(_.bits.data.asUInt())) else readData
-  val im2ColData = io.im2col.resp.bits.a_im2col.asUInt()
+  val accReadData = if (ex_read_from_acc) VecInit(io.acc.read_resp.map(_.bits.data.asUInt)) else readData
+  val im2ColData = io.im2col.resp.bits.a_im2col.asUInt
 
   val readValid = VecInit(io.srams.read.map(bank => ex_read_from_spad.B && bank.resp.valid && !bank.resp.bits.fromDMA))
   val accReadValid = VecInit(io.acc.read_resp.map(bank => ex_read_from_acc.B && bank.valid && !bank.bits.fromDMA))
@@ -933,7 +933,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
     if (ex_write_to_spad) {
       io.srams.write(i).en := start_array_outputting && w_bank === i.U && !write_to_acc && !is_garbage_addr && write_this_row
       io.srams.write(i).addr := w_row
-      io.srams.write(i).data := activated_wdata.asUInt()
+      io.srams.write(i).data := activated_wdata.asUInt
       io.srams.write(i).mask := w_mask.flatMap(b => Seq.fill(inputType.getWidth / (aligned_to * 8))(b))
     } else {
       io.srams.write(i).en := false.B
@@ -993,7 +993,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
     complete_bits_count := complete_bits_count + 1.U
   }
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     // pending_completed_rob_id.valid := false.B
     pending_completed_rob_ids.foreach(_.valid := false.B)
   }
diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala
index 6e7168e9..63a41496 100644
--- a/src/main/scala/gemmini/FrontendTLB.scala
+++ b/src/main/scala/gemmini/FrontendTLB.scala
@@ -119,7 +119,7 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_fi
     val last_translated_vpn = RegInit(0.U(vaddrBits.W))
     val last_translated_ppn = RegInit(0.U(paddrBits.W))
 
-    val l0_tlb_hit = last_translated_valid && ((client.req.bits.tlb_req.vaddr >> pgIdxBits).asUInt() === (last_translated_vpn >> pgIdxBits).asUInt())
+    val l0_tlb_hit = last_translated_valid && ((client.req.bits.tlb_req.vaddr >> pgIdxBits).asUInt === (last_translated_vpn >> pgIdxBits).asUInt)
     val l0_tlb_paddr = Cat(last_translated_ppn >> pgIdxBits, client.req.bits.tlb_req.vaddr(pgIdxBits-1,0))
 
     val tlb = if (use_shared_tlb) tlbs.head else tlbs(i)
diff --git a/src/main/scala/gemmini/Im2Col.scala b/src/main/scala/gemmini/Im2Col.scala
index a317902b..65c4dbd4 100644
--- a/src/main/scala/gemmini/Im2Col.scala
+++ b/src/main/scala/gemmini/Im2Col.scala
@@ -90,12 +90,12 @@ class Im2Col[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V
 
 
   //how much horizonal turn we have to compute (input_channel*kernel_dim/16)
-  //val turn = Mux(im2col_width(3,0) === 0.U, (im2col_width >> (log2Up(block_size)).U).asUInt(), (im2col_width >> (log2Up(block_size)).U).asUInt + 1.U)
+  //val turn = Mux(im2col_width(3,0) === 0.U, (im2col_width >> (log2Up(block_size)).U).asUInt, (im2col_width >> (log2Up(block_size)).U).asUInt + 1.U)
   val turn = filter_dim2//Mux(channel(3,0) === 0.U, filter_dim2*channel(6, 4), filter_dim2*channel(6, 4) + 1.U)
 
   //Seah: added for more than 16 rows of output
   //how much vertical turn we have to compute (output_dim/16)
-  //val row_turn = Mux(output_dim(3,0) === 0.U, (output_dim >> (log2Up(block_size)).U).asUInt - 1.U, (output_dim >> (log2Up(block_size)).U).asUInt()) //im2col height
+  //val row_turn = Mux(output_dim(3,0) === 0.U, (output_dim >> (log2Up(block_size)).U).asUInt - 1.U, (output_dim >> (log2Up(block_size)).U).asUInt) //im2col height
   val row_turn = io.req.bits.row_turn
   val row_left = io.req.bits.row_left
 
diff --git a/src/main/scala/gemmini/InstructionCompression.scala b/src/main/scala/gemmini/InstructionCompression.scala
index 96bc77ee..fe6cd3d9 100644
--- a/src/main/scala/gemmini/InstructionCompression.scala
+++ b/src/main/scala/gemmini/InstructionCompression.scala
@@ -40,7 +40,7 @@ class InstCompressor(implicit p: Parameters) extends Module {
     buf(waddr).push(io.in.bits)
   }
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     buf.foreach(_.valid := false.B)
   }
 }
@@ -79,7 +79,7 @@ class InstDecompressor(rob_entries: Int)(implicit p: Parameters) extends Module
     pushed_preload := false.B
   }
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     buf.valid := false.B
   }
 }
diff --git a/src/main/scala/gemmini/LocalAddr.scala b/src/main/scala/gemmini/LocalAddr.scala
index b53addea..fdaeffe6 100644
--- a/src/main/scala/gemmini/LocalAddr.scala
+++ b/src/main/scala/gemmini/LocalAddr.scala
@@ -40,8 +40,8 @@ class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_en
 
   def is_same_address(other: LocalAddr): Bool = is_acc_addr === other.is_acc_addr && data === other.data
   def is_same_address(other: UInt): Bool = is_same_address(other.asTypeOf(this))
-  def is_garbage(dummy: Int = 0) = is_acc_addr && accumulate && read_full_acc_row && data.andR() &&
-    (if (garbage_bit.getWidth > 0) garbage_bit.asBool() else true.B)
+  def is_garbage(dummy: Int = 0) = is_acc_addr && accumulate && read_full_acc_row && data.andR &&
+    (if (garbage_bit.getWidth > 0) garbage_bit.asBool else true.B)
 
   def +(other: UInt) = {
     require(isPow2(sp_bank_entries)) // TODO remove this requirement
diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index 210bcade..53032a51 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -174,10 +174,10 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi
     io.cmd.bits.rs1 := o.dram_addr
     val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType)
     mvin_cmd_rs2 := DontCare
-    mvin_cmd_rs2.num_rows := o.I.asUInt()
-    mvin_cmd_rs2.num_cols := o.J.asUInt()
+    mvin_cmd_rs2.num_rows := o.I.asUInt
+    mvin_cmd_rs2.num_cols := o.J.asUInt
     mvin_cmd_rs2.local_addr := cast_to_acc_addr(mvin_cmd_rs2.local_addr, o.spad_addr, accumulate = false.B, read_full = false.B)
-    io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt()
+    io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt
   }
 
   // Sending outputs
@@ -257,8 +257,8 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
   def undilated(x: UInt): UInt = (x +& req.input_dilated) >> req.input_dilated
 
   // Derived parameters
-  val max_ichs_per_mvin = Mux(ichs < (max_block_len * block_size).U, ichs, (max_block_len * block_size).U).zext()
-  val max_batches_per_mvin = Mux(batches < (max_block_len * block_size).U, batches, (max_block_len * block_size).U).zext()
+  val max_ichs_per_mvin = Mux(ichs < (max_block_len * block_size).U, ichs, (max_block_len * block_size).U).zext
+  val max_batches_per_mvin = Mux(batches < (max_block_len * block_size).U, batches, (max_block_len * block_size).U).zext
   val max_chs_per_mvin = Mux(req.trans_input_3120, max_batches_per_mvin, max_ichs_per_mvin)
 
   // Iterators
@@ -268,34 +268,34 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
   val ich = Reg(SInt(large_iterator_bitwidth.W))
 
   // Calculated params
-  val irow_padded = irow +& undilated(upad).zext()
-  val icol_padded = icol +& undilated(lpad).zext()
-  val is_zeros = irow < 0.S || irow >= irows_unpadded.zext() || icol < 0.S || icol >= icols_unpadded.zext()
+  val irow_padded = irow +& undilated(upad).zext
+  val icol_padded = icol +& undilated(lpad).zext
+  val is_zeros = irow < 0.S || irow >= irows_unpadded.zext || icol < 0.S || icol >= icols_unpadded.zext
 
   val dram_stride = Mux(req.trans_input_3120, batch_size * (input_w/8).U, in_channels * (input_w/8).U)
 
   // Addresses
-  val dram_offset = Mux(req.trans_input_3120, (((ich * in_dim * in_dim +& irow*in_dim +& icol) * batches +& b) * (input_w/8).U).asUInt(),
-    (((b * in_dim * in_dim +& irow*in_dim +& icol) * in_channels +& ich) * (input_w/8).U).asUInt())
+  val dram_offset = Mux(req.trans_input_3120, (((ich * in_dim * in_dim +& irow*in_dim +& icol) * batches +& b) * (input_w/8).U).asUInt,
+    (((b * in_dim * in_dim +& irow*in_dim +& icol) * in_channels +& ich) * (input_w/8).U).asUInt)
   val dram_addr = Mux(is_zeros, 0.U, req.dram_addr + LoopConv.castDramOffset(dram_offset))
   val spad_addr = Mux(req.trans_input_3120,
     // To prevent Verilator errors, we replace some "/ block_size.U" calls here with ">> log2Up(block_size)"
-    req.addr_start.zext() +& (b >> log2Up(block_size)) * input_spad_stride +& ich * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample),
-    req.addr_start.zext() +& (ich >> log2Up(block_size)) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample))
+    req.addr_start.zext +& (b >> log2Up(block_size)) * input_spad_stride +& ich * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample),
+    req.addr_start.zext +& (ich >> log2Up(block_size)) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample))
 
   // Sizes
-  val block_size_downsampled = (block_size.U << req.downsample).asUInt().zext()
+  val block_size_downsampled = (block_size.U << req.downsample).asUInt.zext
 
   val I = MuxCase(
-    Mux(icols_unpadded.zext() -& icol > block_size_downsampled, block_size_downsampled, icols_unpadded.zext() -& icol),
+    Mux(icols_unpadded.zext -& icol > block_size_downsampled, block_size_downsampled, icols_unpadded.zext -& icol),
     Seq(
       (icol < 0.S) -> Mux((0.S-&icol) > block_size.S, block_size.S, 0.S-&icol),
-      (icol >= icols_unpadded.zext()) -> Mux(icols_unpadded.zext() +& undilated(rpad).zext() -& icol > block_size.S, block_size.S, icols_unpadded.zext() +& undilated(rpad).zext() -& icol)
+      (icol >= icols_unpadded.zext) -> Mux(icols_unpadded.zext +& undilated(rpad).zext -& icol > block_size.S, block_size.S, icols_unpadded.zext +& undilated(rpad).zext -& icol)
     )
   )
   val K = Mux(req.trans_input_3120,
-    Mux(batches.zext() -& b > max_chs_per_mvin, max_chs_per_mvin, batches.zext() -& b),
-    Mux(ichs.zext() -& ich > max_chs_per_mvin, max_chs_per_mvin, ichs.zext() -& ich))
+    Mux(batches.zext -& b > max_chs_per_mvin, max_chs_per_mvin, batches.zext -& b),
+    Mux(ichs.zext -& ich > max_chs_per_mvin, max_chs_per_mvin, ichs.zext -& ich))
 
   class RoCCCommandWithAddr extends Bundle {
     val cmd = new RoCCCommand
@@ -318,7 +318,7 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
   config_cmd_rs1.state_id := 0.U
   config_cmd_rs1.shrink := 0.U
   config_cmd_rs1._unused := 1.U
-  config_cmd.rs1 := config_cmd_rs1.asUInt()
+  config_cmd.rs1 := config_cmd_rs1.asUInt
 
   config_cmd.rs2 := dram_stride << req.downsample
 
@@ -348,10 +348,10 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
     io.cmd.bits.rs1 := o.dram_addr
     val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType)
     mvin_cmd_rs2 := DontCare
-    mvin_cmd_rs2.num_rows := (o.I >> req.downsample).asUInt()
-    mvin_cmd_rs2.num_cols := o.K.asUInt()
+    mvin_cmd_rs2.num_rows := (o.I >> req.downsample).asUInt
+    mvin_cmd_rs2.num_cols := o.K.asUInt
     mvin_cmd_rs2.local_addr := cast_to_sp_addr(mvin_cmd_rs2.local_addr, o.spad_addr)
-    io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt()
+    io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt
   }
 
   // Sending outputs
@@ -359,23 +359,23 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
     when (state === config) {
       state := ld
     }.otherwise {
-      val b_it = Mux(req.trans_input_3120, max_chs_per_mvin.asUInt(), 1.U)
-      val ich_it = Mux(req.trans_input_3120, 1.U, max_chs_per_mvin.asUInt())
+      val b_it = Mux(req.trans_input_3120, max_chs_per_mvin.asUInt, 1.U)
+      val ich_it = Mux(req.trans_input_3120, 1.U, max_chs_per_mvin.asUInt)
 
-      val next_ich = sFloorAdd(ich, ich_it, ichs.zext(), 0.S)
-      val next_icol = sFloorAdd(icol, I.asUInt(), (icols_unpadded +& undilated(rpad)).zext(), 0.S-&undilated(lpad).zext(),
+      val next_ich = sFloorAdd(ich, ich_it, ichs.zext, 0.S)
+      val next_icol = sFloorAdd(icol, I.asUInt, (icols_unpadded +& undilated(rpad)).zext, 0.S-&undilated(lpad).zext,
         next_ich === 0.S)
-      val next_irow = sFloorAdd(irow, 1.U << req.downsample, (irows_unpadded +& undilated(dpad)).zext(), 0.S-&undilated(upad).zext(),
-        next_icol === 0.S-&undilated(lpad).zext() && next_ich === 0.S)
-      val next_b = sFloorAdd(b, b_it, batches.zext(), 0.S,
-        next_irow === 0.S-&undilated(upad).zext() && next_icol === 0.S-&undilated(lpad).zext() && next_ich === 0.S)
+      val next_irow = sFloorAdd(irow, 1.U << req.downsample, (irows_unpadded +& undilated(dpad)).zext, 0.S-&undilated(upad).zext,
+        next_icol === 0.S-&undilated(lpad).zext && next_ich === 0.S)
+      val next_b = sFloorAdd(b, b_it, batches.zext, 0.S,
+        next_irow === 0.S-&undilated(upad).zext && next_icol === 0.S-&undilated(lpad).zext && next_ich === 0.S)
 
       ich := next_ich
       icol := next_icol
       irow := next_irow
       b := next_b
 
-      state := Mux(next_b === 0.S && next_irow === 0.S-&undilated(upad).zext() && next_icol === 0.S-&undilated(lpad).zext() && next_ich === 0.S,
+      state := Mux(next_b === 0.S && next_irow === 0.S-&undilated(upad).zext && next_icol === 0.S-&undilated(lpad).zext && next_ich === 0.S,
         idle, ld)
     }
   }
@@ -385,8 +385,8 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
     req := io.req.bits
     state := config
     b := 0.S
-    irow := 0.S -& ((io.req.bits.inner_bounds.upad +& io.req.bits.input_dilated) >> io.req.bits.input_dilated).zext()
-    icol := 0.S -& ((io.req.bits.inner_bounds.lpad +& io.req.bits.input_dilated) >> io.req.bits.input_dilated).zext()
+    irow := 0.S -& ((io.req.bits.inner_bounds.upad +& io.req.bits.input_dilated) >> io.req.bits.input_dilated).zext
+    icol := 0.S -& ((io.req.bits.inner_bounds.lpad +& io.req.bits.input_dilated) >> io.req.bits.input_dilated).zext
     ich := 0.S
   }
 }
@@ -530,7 +530,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
     mvin_cmd_rs2.num_rows := o.K
     mvin_cmd_rs2.num_cols := o.J
     mvin_cmd_rs2.local_addr := cast_to_sp_addr(mvin_cmd_rs2.local_addr, o.spad_addr)
-    io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt()
+    io.cmd.bits.rs2 := mvin_cmd_rs2.asUInt
   }
 
   // Sending outputs
@@ -633,8 +633,8 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
   val ocol = Reg(UInt(small_iterator_bitwidth.W))
 
   // TODO kernel-dilation and input-dilation can never be activated at the same time, so we can optimize out some multiplications by kernel_dilation
-  val skip_iteration = state >= pre && req.input_dilated && (((krow * kernel_dilation +& orow -& upad)(0) & req.input_dilated).asBool() ||
-    ((kcol * kernel_dilation +& ocol -& lpad)(0) & req.input_dilated).asBool())
+  val skip_iteration = state >= pre && req.input_dilated && (((krow * kernel_dilation +& orow -& upad)(0) & req.input_dilated).asBool ||
+    ((kcol * kernel_dilation +& ocol -& lpad)(0) & req.input_dilated).asBool)
 
   val pixels = Mux(kcols - kcol > req.max_pixels_per_row, req.max_pixels_per_row, kcols - kcol)
 
@@ -643,7 +643,7 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
 
   val I = Mux(req.trans_input_3120,
     Mux(batches - b > block_size.U, block_size.U, batches - b),
-    undilated(Mux(ocols - ocol > (block_size.U << req.input_dilated).asUInt(), (block_size.U << req.input_dilated).asUInt(), ocols - ocol)))
+    undilated(Mux(ocols - ocol > (block_size.U << req.input_dilated).asUInt, (block_size.U << req.input_dilated).asUInt, ocols - ocol)))
   val J = Mux(ochs - och > block_size.U, block_size.U, ochs - och)
   val K = pixels * Mux(kchs - kch > block_size.U, block_size.U, kchs - kch)
 
@@ -687,16 +687,16 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
 
   val config_cmd_rs1 = Wire(config_ex_rs1_t.cloneType)
   config_cmd_rs1 := DontCare
-  config_cmd_rs1.a_stride := (irows * icols).asUInt()
+  config_cmd_rs1.a_stride := (irows * icols).asUInt
   config_cmd_rs1.set_only_strides := 1.U
   config_cmd_rs1.cmd_type := 0.U
 
   val config_cmd_rs2 = Wire(new ConfigExRs2)
   config_cmd_rs2 := DontCare
-  config_cmd_rs2.c_stride := (orows * ocols).asUInt()
+  config_cmd_rs2.c_stride := (orows * ocols).asUInt
 
-  config_cmd.rs1 := config_cmd_rs1.asUInt()
-  config_cmd.rs2 := config_cmd_rs2.asUInt()
+  config_cmd.rs1 := config_cmd_rs1.asUInt
+  config_cmd.rs2 := config_cmd_rs2.asUInt
 
   val pre_cmd = Wire(new RoCCCommand) // preload
   pre_cmd := DontCare
@@ -735,35 +735,35 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
 
     val pre_cmd_rs1 = Wire(preload_rs1_t.cloneType)
     pre_cmd_rs1 := DontCare
-    pre_cmd_rs1.num_rows := o.K.asUInt()
-    pre_cmd_rs1.num_cols := o.J.asUInt()
+    pre_cmd_rs1.num_rows := o.K.asUInt
+    pre_cmd_rs1.num_cols := o.J.asUInt
     pre_cmd_rs1.local_addr := Mux(o.new_weights, cast_to_sp_addr(pre_cmd_rs1.local_addr, o.b_addr),
       garbage_addr(pre_cmd_rs1.local_addr))
 
     val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType)
     pre_cmd_rs2 := DontCare
-    pre_cmd_rs2.num_rows := o.I.asUInt()
-    pre_cmd_rs2.num_cols := o.J.asUInt()
+    pre_cmd_rs2.num_rows := o.I.asUInt
+    pre_cmd_rs2.num_cols := o.J.asUInt
     pre_cmd_rs2.local_addr := cast_to_acc_addr(pre_cmd_rs2.local_addr, o.c_addr, accumulate = true.B, read_full = false.B)
 
-    io.cmd.bits.rs1 := pre_cmd_rs1.asUInt()
-    io.cmd.bits.rs2 := pre_cmd_rs2.asUInt()
+    io.cmd.bits.rs1 := pre_cmd_rs1.asUInt
+    io.cmd.bits.rs2 := pre_cmd_rs2.asUInt
   }.elsewhen(command_p.io.out.bits.cmd.inst.funct =/= CONFIG_CMD) {
     val o = command_p.io.out.bits
     val comp_cmd_rs1 = Wire(compute_rs1_t.cloneType)
     comp_cmd_rs1 := DontCare
-    comp_cmd_rs1.num_rows := o.I.asUInt()
-    comp_cmd_rs1.num_cols := o.K.asUInt()
+    comp_cmd_rs1.num_rows := o.I.asUInt
+    comp_cmd_rs1.num_cols := o.K.asUInt
     comp_cmd_rs1.local_addr := cast_to_sp_addr(comp_cmd_rs1.local_addr, o.a_addr)
 
     val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType)
     comp_cmd_rs2 := DontCare
-    comp_cmd_rs2.num_rows := o.I.asUInt()
-    comp_cmd_rs2.num_cols := o.J.asUInt()
+    comp_cmd_rs2.num_rows := o.I.asUInt
+    comp_cmd_rs2.num_cols := o.J.asUInt
     comp_cmd_rs2.local_addr := garbage_addr(comp_cmd_rs2.local_addr)
 
-    io.cmd.bits.rs1 := comp_cmd_rs1.asUInt()
-    io.cmd.bits.rs2 := comp_cmd_rs2.asUInt()
+    io.cmd.bits.rs1 := comp_cmd_rs1.asUInt
+    io.cmd.bits.rs2 := comp_cmd_rs2.asUInt
   }
 
   // Updating "new_weights"
@@ -779,7 +779,7 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
       state := comp
     }.otherwise {
       val b_it = Mux(req.trans_input_3120, block_size.U, 1.U)
-      val ocol_it = Mux(skip_iteration || req.trans_input_3120, 1.U, block_size.U << req.input_dilated).asUInt()
+      val ocol_it = Mux(skip_iteration || req.trans_input_3120, 1.U, block_size.U << req.input_dilated).asUInt
 
       val next_ocol = floorAdd(ocol, ocol_it, ocols)
       val next_orow = floorAdd(orow, 1.U, orows, next_ocol === 0.U)
@@ -928,13 +928,13 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
   pre_pool_config_cmd_rs1.pool_stride := pool_stride
   pre_pool_config_cmd_rs1.activation := req.activation
   pre_pool_config_cmd_rs1.cmd_type := CONFIG_STORE
-  pre_pool_config_cmd.rs1 := pre_pool_config_cmd_rs1.asUInt()
+  pre_pool_config_cmd.rs1 := pre_pool_config_cmd_rs1.asUInt
 
   val pre_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType)
   pre_pool_config_cmd_rs2 := DontCare
   pre_pool_config_cmd_rs2.acc_scale := ACC_SCALE_NO_CHANGE
   pre_pool_config_cmd_rs2.stride := out_channels * (input_w / 8).U
-  pre_pool_config_cmd.rs2 := pre_pool_config_cmd_rs2.asUInt()
+  pre_pool_config_cmd.rs2 := pre_pool_config_cmd_rs2.asUInt
 
   val post_pool_config_cmd = Wire(new RoCCCommand)
   post_pool_config_cmd := DontCare
@@ -944,13 +944,13 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
   post_pool_config_cmd_rs1 := DontCare
   post_pool_config_cmd_rs1.activation := req.activation
   post_pool_config_cmd_rs1.cmd_type := CONFIG_STORE
-  post_pool_config_cmd.rs1 := post_pool_config_cmd_rs1.asUInt()
+  post_pool_config_cmd.rs1 := post_pool_config_cmd_rs1.asUInt
 
   val post_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType)
   post_pool_config_cmd_rs2 := DontCare
   post_pool_config_cmd_rs2.acc_scale := ACC_SCALE_NO_CHANGE
   post_pool_config_cmd_rs2.stride := out_channels * (input_w / 8).U
-  post_pool_config_cmd.rs2 := post_pool_config_cmd_rs2.asUInt()
+  post_pool_config_cmd.rs2 := post_pool_config_cmd_rs2.asUInt
 
   val pool_cmd = Wire(new RoCCCommand)
   pool_cmd := DontCare
@@ -990,16 +990,16 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
       pool_mvout_cmd_rs2.local_addr := cast_to_acc_addr(pool_mvout_cmd_rs2.local_addr, o.pool_spad_addr, accumulate = false.B, read_full = false.B)
 
       io.cmd.bits.rs1 := o.pool_dram_addr
-      io.cmd.bits.rs2 := pool_mvout_cmd_rs2.asUInt()
+      io.cmd.bits.rs2 := pool_mvout_cmd_rs2.asUInt
     } .otherwise {
       val mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType)
       mvout_cmd_rs2 := DontCare
-      mvout_cmd_rs2.num_rows := o.I.asUInt()
-      mvout_cmd_rs2.num_cols := o.J.asUInt()
+      mvout_cmd_rs2.num_rows := o.I.asUInt
+      mvout_cmd_rs2.num_cols := o.J.asUInt
       mvout_cmd_rs2.local_addr := cast_to_acc_addr(mvout_cmd_rs2.local_addr, o.spad_addr, accumulate = false.B, read_full = false.B)
 
       io.cmd.bits.rs1 := o.dram_addr
-      io.cmd.bits.rs2 := mvout_cmd_rs2.asUInt()
+      io.cmd.bits.rs2 := mvout_cmd_rs2.asUInt
     }
   }
 
@@ -1182,7 +1182,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size:
   val concurrent_loops = 2
   val loops = Reg(Vec(concurrent_loops, new LoopConvState(block_size, large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth, coreMaxAddrBits, max_addr, max_acc_addr)))
   val head_loop_id = RegInit(0.U(log2Up(concurrent_loops).W))
-  val tail_loop_id = (~head_loop_id).asUInt() // This is the loop that we always try to configure if available
+  val tail_loop_id = (~head_loop_id).asUInt // This is the loop that we always try to configure if available
   val head_loop = loops(head_loop_id)
   val tail_loop = loops(tail_loop_id)
 
@@ -1499,7 +1499,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size:
   }
 
   // Resets
-  when (reset.asBool()) {
+  when (reset.asBool) {
     loops.zipWithIndex.foreach { case (l, i) =>
       l.reset()
       l.a_addr_start := (i * (max_addr / concurrent_loops)).U
diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala
index 86552d56..a33155e9 100644
--- a/src/main/scala/gemmini/LoopMatmul.scala
+++ b/src/main/scala/gemmini/LoopMatmul.scala
@@ -76,10 +76,10 @@ class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
 
   val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType)
   mvin_cmd_rs2 := DontCare
-  mvin_cmd_rs2.num_rows := rows.asUInt()
-  mvin_cmd_rs2.num_cols := cols.asUInt()
+  mvin_cmd_rs2.num_rows := rows.asUInt
+  mvin_cmd_rs2.num_cols := cols.asUInt
   mvin_cmd_rs2.local_addr := cast_to_sp_addr(mvin_cmd_rs2.local_addr, sp_addr)
-  mvin_cmd.rs2 := mvin_cmd_rs2.asUInt()
+  mvin_cmd.rs2 := mvin_cmd_rs2.asUInt
 
   io.req.ready := state === idle
   io.i := i
@@ -184,10 +184,10 @@ class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
 
   val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType)
   mvin_cmd_rs2 := DontCare
-  mvin_cmd_rs2.num_rows := rows.asUInt()
-  mvin_cmd_rs2.num_cols := cols.asUInt()
+  mvin_cmd_rs2.num_rows := rows.asUInt
+  mvin_cmd_rs2.num_cols := cols.asUInt
   mvin_cmd_rs2.local_addr := cast_to_sp_addr(mvin_cmd_rs2.local_addr, sp_addr)
-  mvin_cmd.rs2 := mvin_cmd_rs2.asUInt()
+  mvin_cmd.rs2 := mvin_cmd_rs2.asUInt
 
   io.req.ready := state === idle
   io.k := k
@@ -281,10 +281,10 @@ class LoopMatmulLdD(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
 
   val mvin_cmd_rs2 = Wire(mvin_rs2_t.cloneType)
   mvin_cmd_rs2 := DontCare
-  mvin_cmd_rs2.num_rows := rows.asUInt()
-  mvin_cmd_rs2.num_cols := cols.asUInt()
+  mvin_cmd_rs2.num_rows := rows.asUInt
+  mvin_cmd_rs2.num_cols := cols.asUInt
   mvin_cmd_rs2.local_addr := cast_to_acc_addr(mvin_cmd_rs2.local_addr, sp_addr, accumulate = false.B, read_full = false.B)
-  mvin_cmd.rs2 := mvin_cmd_rs2.asUInt()
+  mvin_cmd.rs2 := mvin_cmd_rs2.asUInt
 
   io.req.ready := state === idle
   io.idle := state === idle
@@ -401,19 +401,19 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth
 
   val pre_cmd_rs1 = Wire(preload_rs1_t.cloneType)
   pre_cmd_rs1 := DontCare
-  pre_cmd_rs1.num_rows := b_rows.asUInt()
-  pre_cmd_rs1.num_cols := b_cols.asUInt()
+  pre_cmd_rs1.num_rows := b_rows.asUInt
+  pre_cmd_rs1.num_cols := b_cols.asUInt
   pre_cmd_rs1.local_addr := Mux(i === 0.U, cast_to_sp_addr(pre_cmd_rs1.local_addr, b_addr),
     garbage_addr(pre_cmd_rs1.local_addr))
 
   val pre_cmd_rs2 = Wire(preload_rs2_t.cloneType)
   pre_cmd_rs2 := DontCare
-  pre_cmd_rs2.num_rows := c_rows.asUInt()
-  pre_cmd_rs2.num_cols := c_cols.asUInt()
+  pre_cmd_rs2.num_rows := c_rows.asUInt
+  pre_cmd_rs2.num_cols := c_cols.asUInt
   pre_cmd_rs2.local_addr := cast_to_acc_addr(pre_cmd_rs2.local_addr, c_addr, accumulate = req.accumulate || k =/= 0.U, read_full = false.B)
 
-  pre_cmd.rs1 := pre_cmd_rs1.asUInt()
-  pre_cmd.rs2 := pre_cmd_rs2.asUInt()
+  pre_cmd.rs1 := pre_cmd_rs1.asUInt
+  pre_cmd.rs2 := pre_cmd_rs2.asUInt
 
   val comp_cmd = Wire(new RoCCCommand())
   comp_cmd := DontCare
@@ -421,8 +421,8 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth
 
   val comp_cmd_rs1 = Wire(compute_rs1_t.cloneType)
   comp_cmd_rs1 := DontCare
-  comp_cmd_rs1.num_rows := a_rows.asUInt()
-  comp_cmd_rs1.num_cols := a_cols.asUInt()
+  comp_cmd_rs1.num_rows := a_rows.asUInt
+  comp_cmd_rs1.num_cols := a_cols.asUInt
   comp_cmd_rs1.local_addr := cast_to_sp_addr(comp_cmd_rs1.local_addr, a_addr)
 
   val comp_cmd_rs2 = Wire(compute_rs2_t.cloneType)
@@ -431,8 +431,8 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth
   comp_cmd_rs2.num_cols := block_size.U
   comp_cmd_rs2.local_addr := garbage_addr(comp_cmd_rs2.local_addr)
 
-  comp_cmd.rs1 := comp_cmd_rs1.asUInt()
-  comp_cmd.rs2 := comp_cmd_rs2.asUInt()
+  comp_cmd.rs1 := comp_cmd_rs1.asUInt
+  comp_cmd.rs2 := comp_cmd_rs2.asUInt
 
   io.req.ready := state === idle
   io.k := k
@@ -528,7 +528,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   val j = Reg(UInt(iterator_bitwidth.W))
   val i = Reg(UInt(iterator_bitwidth.W))
 
-  val acc_addr_start = /*(BigInt(1) << 31).U | (req.full_c << 29.U).asUInt() |*/ req.addr_start
+  val acc_addr_start = /*(BigInt(1) << 31).U | (req.full_c << 29.U).asUInt |*/ req.addr_start
 
   val dram_offset = Mux(req.full_c, (i * req.dram_stride + j) * block_size.U * (acc_w/8).U,
     (i * req.dram_stride + j) * block_size.U * (input_w/8).U)
@@ -545,10 +545,10 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
 
   val mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType)
   mvout_cmd_rs2 := DontCare
-  mvout_cmd_rs2.num_rows := rows.asUInt()
-  mvout_cmd_rs2.num_cols := cols.asUInt()
+  mvout_cmd_rs2.num_rows := rows.asUInt
+  mvout_cmd_rs2.num_cols := cols.asUInt
   mvout_cmd_rs2.local_addr := cast_to_acc_addr(mvout_cmd_rs2.local_addr, sp_addr, accumulate = false.B, read_full = req.full_c)
-  mvout_cmd.rs2 := mvout_cmd_rs2.asUInt()
+  mvout_cmd.rs2 := mvout_cmd_rs2.asUInt
 
   // Layernorm iterators and calculations
   val ln_row = Reg(UInt(iterator_bitwidth.W))
@@ -585,7 +585,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   val ln_config_norm = Wire(new RoCCCommand)
   ln_config_norm := DontCare
   ln_config_norm.inst.funct := CONFIG_CMD
-  ln_config_norm.rs1 := ln_config_norm_rs1.asUInt()
+  ln_config_norm.rs1 := ln_config_norm_rs1.asUInt
   ln_config_norm.rs2 := DontCare
 
   val ln_mvout_cmd = Wire(new RoCCCommand)
@@ -596,10 +596,10 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   val ln_mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType)
   ln_mvout_cmd_rs2 := DontCare
   ln_mvout_cmd_rs2.num_rows := 1.U
-  ln_mvout_cmd_rs2.num_cols := cols.asUInt()
+  ln_mvout_cmd_rs2.num_cols := cols.asUInt
   ln_mvout_cmd_rs2.local_addr := cast_to_acc_addr(ln_mvout_cmd_rs2.local_addr, ln_sp_addr, accumulate = false.B, read_full = req.full_c)
   ln_mvout_cmd_rs2.local_addr.norm_cmd := ln_norm_cmd
-  ln_mvout_cmd.rs2 := ln_mvout_cmd_rs2.asUInt()
+  ln_mvout_cmd.rs2 := ln_mvout_cmd_rs2.asUInt
 
   io.req.ready := state === idle
   io.j := j
@@ -760,7 +760,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size
   val concurrent_loops = 2
   val loops = Reg(Vec(concurrent_loops, new LoopMatmulState(iterator_bitwidth, coreMaxAddrBits, max_addr, max_acc_addr)))
   val head_loop_id = Reg(UInt(log2Up(concurrent_loops).W))
-  val tail_loop_id = (~head_loop_id).asUInt() // This is the loop that we always try to configure if available
+  val tail_loop_id = (~head_loop_id).asUInt // This is the loop that we always try to configure if available
   val head_loop = loops(head_loop_id)
   val tail_loop = loops(tail_loop_id)
 
@@ -1052,7 +1052,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size
   }
 
   // Resets
-  when (reset.asBool()) {
+  when (reset.asBool) {
     loops.zipWithIndex.foreach { case (l, i) =>
       l.reset()
       l.a_addr_start := (i * (max_addr / concurrent_loops)).U
diff --git a/src/main/scala/gemmini/LoopUnroller.scala b/src/main/scala/gemmini/LoopUnroller.scala
index 02ac7d71..63a0150b 100644
--- a/src/main/scala/gemmini/LoopUnroller.scala
+++ b/src/main/scala/gemmini/LoopUnroller.scala
@@ -42,8 +42,8 @@ class LoopUnroller(block_size: Int)(implicit p: Parameters) extends Module {
 
   val a_start = cmd.bits.rs1(31, 0)
   val b_start = cmd.bits.rs1(63, 32)
-  val c_start = (3.U << 30).asUInt()
-  val d_start = (1.U << 31).asUInt()
+  val c_start = (3.U << 30).asUInt
+  val d_start = (1.U << 31).asUInt
 
   // TODO get rid of the x * max_y multiplications here
   val a_addr = a_start + (i * max_k + k) * block_size.U
diff --git a/src/main/scala/gemmini/MeshWithDelays.scala b/src/main/scala/gemmini/MeshWithDelays.scala
index d0aced16..516760bf 100644
--- a/src/main/scala/gemmini/MeshWithDelays.scala
+++ b/src/main/scala/gemmini/MeshWithDelays.scala
@@ -248,7 +248,7 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
   io.req.ready := (!req.valid || last_fire) && tagq.io.enq.ready && total_rows_q.io.enq.ready
   io.tags_in_progress := tagq.io.all.map(_.tag)
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     req.valid := false.B
   }
 
diff --git a/src/main/scala/gemmini/Normalizer.scala b/src/main/scala/gemmini/Normalizer.scala
index 89dca2db..67dd18ac 100644
--- a/src/main/scala/gemmini/Normalizer.scala
+++ b/src/main/scala/gemmini/Normalizer.scala
@@ -351,7 +351,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
     assert(acc_t.isInstanceOf[SInt])
 
     when (stat.state === waiting_for_stddev) {
-      stat.inv_stddev := Mux(sqrt_out.bits.asUInt() === acc_t.zero.asUInt(),
+      stat.inv_stddev := Mux(sqrt_out.bits.asUInt === acc_t.zero.asUInt,
         1.S(acc_t.getWidth.W).asTypeOf(acc_t),
         sqrt_out.bits
       )
@@ -405,14 +405,14 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
       def in_to_float(x: SInt) = {
         val in_to_rec_fn = Module(new INToRecFN(intWidth = sum_exp_to_inv.getWidth, expWidth, sigWidth))
         in_to_rec_fn.io.signedIn := true.B
-        in_to_rec_fn.io.in := x.asUInt()
+        in_to_rec_fn.io.in := x.asUInt
         in_to_rec_fn.io.roundingMode := consts.round_near_even // consts.round_near_maxMag
         in_to_rec_fn.io.detectTininess := consts.tininess_afterRounding
 
         in_to_rec_fn.io.out
       }
 
-      val self_rec = in_to_float(sum_exp_to_inv.asUInt().asSInt())
+      val self_rec = in_to_float(sum_exp_to_inv.asUInt.asSInt)
       val one_rec = in_to_float(127.S) // softmax maximum is 127 for signed int8
 
       // Instantiate the hardloat divider
@@ -436,7 +436,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
     val stat = stats(sum_exp_to_inv_id)
 
     exp_divider_in.valid := (stat.state === get_inv_sum_exp) && !lanes.io.busy
-    exp_divider_in.bits := sum_exp_to_inv.asUInt()
+    exp_divider_in.bits := sum_exp_to_inv.asUInt
   }
 
   {
@@ -587,7 +587,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
   assert(acc_t.getWidth == scale_t.getWidth, "we use the same variable to hold both the variance and the inv-stddev, so we need them to see the width")
 
   // Resets
-  when (reset.asBool()) {
+  when (reset.asBool) {
     stats.foreach(_.state := idle)
     stats.foreach(_.sum := acc_t.zero)
     stats.foreach(_.max := acc_t.minimum)
diff --git a/src/main/scala/gemmini/PixelRepeater.scala b/src/main/scala/gemmini/PixelRepeater.scala
index ddab4422..ecf9481c 100644
--- a/src/main/scala/gemmini/PixelRepeater.scala
+++ b/src/main/scala/gemmini/PixelRepeater.scala
@@ -48,8 +48,8 @@ class PixelRepeater[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols
     val out_shift = Wire(UInt(log2Up(block_cols / 2 + 1).W))
     out_shift := req.bits.pixel_repeats * req.bits.len
 
-    io.resp.bits.out := (req.bits.in.asUInt() << (out_shift * t.getWidth.U)).asTypeOf(io.resp.bits.out)
-    io.resp.bits.mask := (req.bits.mask.asUInt() << (out_shift * ((t.getWidth / 8) / aligned_to).U)).asTypeOf(io.resp.bits.mask)
+    io.resp.bits.out := (req.bits.in.asUInt << (out_shift * t.getWidth.U)).asTypeOf(io.resp.bits.out)
+    io.resp.bits.mask := (req.bits.mask.asUInt << (out_shift * ((t.getWidth / 8) / aligned_to).U)).asTypeOf(io.resp.bits.mask)
 
     io.resp.bits.last := req.bits.last && (req.bits.pixel_repeats === 0.U)
     io.resp.bits.tag := req.bits.tag
@@ -84,7 +84,7 @@ class PixelRepeater[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols
       req.bits.pixel_repeats := io.req.bits.pixel_repeats - 1.U
     }
 
-    when(reset.asBool()) {
+    when(reset.asBool) {
       req.pop()
     }
   }
diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala
index 68d0e6e7..47dd5ef1 100644
--- a/src/main/scala/gemmini/ReservationStation.scala
+++ b/src/main/scala/gemmini/ReservationStation.scala
@@ -558,7 +558,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
     PerfCounter(!io.alloc.ready, "reservation_station_full", "cycles where reservation station is full")
   }
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     entries.foreach(_.valid := false.B)
   }
 
diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala
index 70c9140f..91870c80 100644
--- a/src/main/scala/gemmini/Scratchpad.scala
+++ b/src/main/scala/gemmini/Scratchpad.scala
@@ -149,9 +149,9 @@ class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean, us
   val ren = io.read.req.fire
   val rdata = if (single_ported) {
     assert(!(ren && io.write.en))
-    read(raddr, ren && !io.write.en).asUInt()
+    read(raddr, ren && !io.write.en).asUInt
   } else {
-    read(raddr, ren).asUInt()
+    read(raddr, ren).asUInt
   }
 
   val fromDMA = io.read.req.bits.fromDMA
@@ -537,7 +537,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
           bio.write.mask := io.srams.write(i).mask
         }.elsewhen (dmaread) {
           bio.write.addr := laddr.sp_row()
-          bio.write.data := mvin_scale_pixel_repeater.io.resp.bits.out.asUInt()
+          bio.write.data := mvin_scale_pixel_repeater.io.resp.bits.out.asUInt
           bio.write.mask := mvin_scale_pixel_repeater.io.resp.bits.mask take ((spad_w / (aligned_to * 8)) max 1)
 
           mvin_scale_pixel_repeater.io.resp.ready := true.B // TODO we combinationally couple valid and ready signals
diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala
index c9e4fdbb..bf4a71b2 100644
--- a/src/main/scala/gemmini/StoreController.scala
+++ b/src/main/scala/gemmini/StoreController.scala
@@ -226,7 +226,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
           stride := config_stride
 
           activation := config_activation
-          when (!config_acc_scale.asUInt().andR()) {
+          when (!config_acc_scale.asUInt.andR) {
             acc_scale := config_acc_scale.asTypeOf(acc_scale_t)
           }
 
@@ -248,7 +248,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
           cmd.ready := true.B
         }
         .elsewhen(config.has_normalizations.B && DoConfigNorm) {
-          when (!config_set_stats_id_only.asBool()) {
+          when (!config_set_stats_id_only.asBool) {
             igelu_qb := config_igelu_qb.asTypeOf(igelu_qb)
             igelu_qc := config_igelu_qc.asTypeOf(igelu_qc)
             when(config_iexp_q_const_type === 0.U) {
diff --git a/src/main/scala/gemmini/TagQueue.scala b/src/main/scala/gemmini/TagQueue.scala
index 9a6464c3..f656119a 100644
--- a/src/main/scala/gemmini/TagQueue.scala
+++ b/src/main/scala/gemmini/TagQueue.scala
@@ -44,7 +44,7 @@ class TagQueue[T <: Data with TagQueueTag](t: T, entries: Int) extends Module {
     len := len - 1.U
   }
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     regs.foreach(_.make_this_garbage())
   }
 
diff --git a/src/main/scala/gemmini/TilerScheduler.scala b/src/main/scala/gemmini/TilerScheduler.scala
index c09ff949..d9fd8b6a 100644
--- a/src/main/scala/gemmini/TilerScheduler.scala
+++ b/src/main/scala/gemmini/TilerScheduler.scala
@@ -253,7 +253,7 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
                        Cat(older_in_same_q) |
                        Cat(is_st_and_must_wait_for_prior_ex_config) |
                        Cat(is_ex_config_and_must_wait_for_prior_st)
-                      ).asBools().reverse
+                      ).asBools.reverse
 
     new_entry.complete_on_issue := new_entry.is_config && new_entry.q =/= exq
 
@@ -440,7 +440,7 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
     printf(p"Last allocated: $last_allocated\n\n")
   }
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     entries.foreach(_.valid := false.B)
   }
 }
diff --git a/src/main/scala/gemmini/Util.scala b/src/main/scala/gemmini/Util.scala
index 51dc1377..07c36554 100644
--- a/src/main/scala/gemmini/Util.scala
+++ b/src/main/scala/gemmini/Util.scala
@@ -47,9 +47,9 @@ object Util {
   def sFloorAdd(s: SInt, n: UInt, max_plus_one: SInt, min: SInt, en: Bool = true.B): SInt = {
     val max = max_plus_one - 1.S
 
-    MuxCase(s + n.zext(), Seq(
+    MuxCase(s + n.zext, Seq(
       (!en) -> s,
-      ((s +& n.zext()) > max) -> min
+      ((s +& n.zext) > max) -> min
     ))
   }
 
@@ -66,22 +66,22 @@ object Util {
 
   def closestLowerPowerOf2(u: UInt): UInt = {
     // TODO figure out a more efficient way of doing this. Is this many muxes really necessary?
-    val exp = u.asBools().zipWithIndex.map { case (b, i) =>
+    val exp = u.asBools.zipWithIndex.map { case (b, i) =>
         Mux(b, i.U, 0.U)
     }.reduce((acc, u) => Mux(acc > u, acc, u))
 
-    (1.U << exp).asUInt()
+    (1.U << exp).asUInt
   }
 
   def closestAlignedLowerPowerOf2(u: UInt, addr: UInt, stride: UInt, rowBytes: Int): UInt = {
     val lgRowBytes = log2Ceil(rowBytes)
 
     // TODO figure out a more efficient way of doing this. Is this many muxes really necessary?
-    val exp = u.asBools().zipWithIndex.map { case (b, i) =>
+    val exp = u.asBools.zipWithIndex.map { case (b, i) =>
       Mux(b && addr(i + lgRowBytes - 1, 0) === 0.U && stride(i + lgRowBytes - 1, 0) === 0.U, i.U, 0.U)
     }.reduce((acc, u) => Mux(acc > u, acc, u))
 
-    (1.U << exp).asUInt()
+    (1.U << exp).asUInt
   }
 
   // This function will return "next" with a 0-cycle delay when the "enable" signal is high. It's like a queue with
diff --git a/src/main/scala/gemmini/XactTracker.scala b/src/main/scala/gemmini/XactTracker.scala
index 277626a1..afd8f964 100644
--- a/src/main/scala/gemmini/XactTracker.scala
+++ b/src/main/scala/gemmini/XactTracker.scala
@@ -84,7 +84,7 @@ class XactTracker[U <: Data](nXacts: Int, maxShift: Int, spadWidth: Int, accWidt
     assert(entries(io.peek.xactid).valid)
   }
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     entries.foreach(_.valid := false.B)
   }
 
diff --git a/src/main/scala/gemmini/ZeroWriter.scala b/src/main/scala/gemmini/ZeroWriter.scala
index a1834a41..17da09e1 100644
--- a/src/main/scala/gemmini/ZeroWriter.scala
+++ b/src/main/scala/gemmini/ZeroWriter.scala
@@ -70,7 +70,7 @@ class ZeroWriter[T <: Data, U <: Data, V <: Data, Tag <: Data](config: GemminiAr
     col_counter := 0.U
   }
 
-  when (reset.asBool()) {
+  when (reset.asBool) {
     req.pop()
   }
 }

From 7d418c78d822ca465e4b5b3e71b6d7cb19037239 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Wed, 4 Jan 2023 01:24:26 -0800
Subject: [PATCH 19/64] Bump to scala 2.13/new rocketchip

---
 build.sbt                               | 2 +-
 src/main/scala/gemmini/Controller.scala | 2 +-
 src/main/scala/gemmini/DMA.scala        | 9 ++++++---
 src/main/scala/gemmini/Scratchpad.scala | 4 ++--
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/build.sbt b/build.sbt
index b34575a0..de8bd4e5 100644
--- a/build.sbt
+++ b/build.sbt
@@ -4,7 +4,7 @@ name := "gemmini"
 
 version := "3.1.0"
 
-scalaVersion := "2.12.10"
+scalaVersion := "2.13.10"
 
 libraryDependencies ++= Seq(
   "edu.berkeley.cs" %% "chisel3" % "3.4.+",
diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
index 2c15d3ea..d4c5f7d5 100644
--- a/src/main/scala/gemmini/Controller.scala
+++ b/src/main/scala/gemmini/Controller.scala
@@ -65,7 +65,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   counters.io.event_io.collect(spad.module.io.counter)
 
   // TLB
-  implicit val edge = outer.node.edges.out.head
+  implicit val edge = outer.spad.id_node.edges.out.head
   val tlb = Module(new FrontendTLB(2, tlb_size, dma_maxbytes, use_tlb_register_filter, use_firesim_simulation_counters, use_shared_tlb))
   (tlb.io.clients zip outer.spad.module.io.tlb).foreach(t => t._1 <> t._2)
 
diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala
index 71148b67..12a7509f 100644
--- a/src/main/scala/gemmini/DMA.scala
+++ b/src/main/scala/gemmini/DMA.scala
@@ -58,7 +58,8 @@ class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T
   val core = LazyModule(new StreamReaderCore(config, nXacts, beatBits, maxBytes, spadWidth, accWidth, aligned_to, spad_rows, acc_rows, meshRows, use_tlb_register_filter, use_firesim_simulation_counters))
   val node = core.node
 
-  lazy val module = new LazyModuleImp(this) {
+  lazy val module = new Impl
+  class Impl extends LazyModuleImp(this) {
 
     val io = IO(new Bundle {
       val req = Flipped(Decoupled(new StreamReadRequest(spad_rows, acc_rows, config.mvin_scale_t_bits)))
@@ -134,7 +135,8 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf
 
   // TODO when we request data from multiple rows which are actually contiguous in main memory, we should merge them into fewer requests
 
-  lazy val module = new LazyModuleImp(this) with HasCoreParameters with MemoryOpConstants {
+  lazy val module = new Impl
+  class Impl extends LazyModuleImp(this) with HasCoreParameters with MemoryOpConstants {
     val (tl, edge) = node.out(0)
 
     val spadWidthBytes = spadWidth / 8
@@ -353,7 +355,8 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
 
   require(isPow2(aligned_to))
 
-  lazy val module = new LazyModuleImp(this) with HasCoreParameters with MemoryOpConstants {
+  lazy val module = new Impl
+  class Impl extends LazyModuleImp(this) with HasCoreParameters with MemoryOpConstants {
     val (tl, edge) = node.out(0)
     val dataBytes = dataWidth / 8
     val beatBytes = beatBits / 8
diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala
index 70c9140f..0d80efb9 100644
--- a/src/main/scala/gemmini/Scratchpad.scala
+++ b/src/main/scala/gemmini/Scratchpad.scala
@@ -201,8 +201,8 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
   xbar_node := TLBuffer() := writer.node
   id_node := TLWidthWidget(config.dma_buswidth/8) := TLBuffer() := xbar_node
 
-  lazy val module = new LazyModuleImp(this) with HasCoreParameters {
-
+  lazy val module = new Impl
+  class Impl extends LazyModuleImp(this) with HasCoreParameters {
     val io = IO(new Bundle {
       // DMA ports
       val dma = new Bundle {

From 49494fcfce24798cd6da9afc7918135286e158d3 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Fri, 20 Jan 2023 21:41:02 -0800
Subject: [PATCH 20/64] remove repeated lines caused by merging branches

---
 src/main/scala/gemmini/LoopMatmul.scala | 51 -------------------------
 1 file changed, 51 deletions(-)

diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala
index 92ec8fd5..a33155e9 100644
--- a/src/main/scala/gemmini/LoopMatmul.scala
+++ b/src/main/scala/gemmini/LoopMatmul.scala
@@ -601,57 +601,6 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   ln_mvout_cmd_rs2.local_addr.norm_cmd := ln_norm_cmd
   ln_mvout_cmd.rs2 := ln_mvout_cmd_rs2.asUInt
 
-  // Layernorm iterators and calculations
-  val ln_row = Reg(UInt(iterator_bitwidth.W))
-  val ln_cmd = Reg(UInt(iterator_bitwidth.W))
-  val ln_stat_id = Reg(UInt(iterator_bitwidth.W))
-
-  val NORM_STAT_IDS = 4 // TODO magic number
-
-  val ln_norm_cmds = VecInit(VecInit(NormCmd.SUM, NormCmd.MEAN), VecInit(NormCmd.VARIANCE, NormCmd.INV_STDDEV),
-    VecInit(NormCmd.RESET, NormCmd.RESET))
-
-  val sm_norm_cmds = VecInit(VecInit(NormCmd.MAX, NormCmd.MAX), VecInit(NormCmd.SUM_EXP, NormCmd.INV_SUM_EXP),
-    VecInit(NormCmd.RESET, NormCmd.RESET))
-
-  val ln_stat_ids = Mux(rows -& ln_row > NORM_STAT_IDS.U, NORM_STAT_IDS.U, rows -& ln_row)
-
-  val ln_r = ln_row +& ln_stat_id
-
-  val ln_sp_addr = acc_addr_start +& (i * req.max_j +& j) * block_size.U +& ln_r
-  val ln_norm_cmd = Mux(j +& max_blocks >= req.max_j,
-    Mux(req.act === Activation.LAYERNORM, ln_norm_cmds(ln_cmd)(1), sm_norm_cmds(ln_cmd)(1)),
-    Mux(req.act === Activation.LAYERNORM, ln_norm_cmds(ln_cmd)(0), sm_norm_cmds(ln_cmd)(0)))
-
-  // TODO we assume for now that full_C and layernorm aren't true at the same
-  val ln_dram_offset = ((i * req.dram_stride +& j) * block_size.U +& ln_r * req.dram_stride) * (input_w/8).U
-  val ln_dram_addr = req.dram_addr + LoopMatmul.castDramOffset(ln_dram_offset)
-
-  val ln_config_norm_rs1 = Wire(new GemminiISA.ConfigNormRs1)
-  ln_config_norm_rs1 := DontCare
-  ln_config_norm_rs1.set_stats_id_only := 1.U
-  ln_config_norm_rs1.cmd_type := CONFIG_NORM
-  ln_config_norm_rs1.norm_stats_id := ln_stat_id
-
-  val ln_config_norm = Wire(new RoCCCommand)
-  ln_config_norm := DontCare
-  ln_config_norm.inst.funct := CONFIG_CMD
-  ln_config_norm.rs1 := ln_config_norm_rs1.asUInt()
-  ln_config_norm.rs2 := DontCare
-
-  val ln_mvout_cmd = Wire(new RoCCCommand)
-  ln_mvout_cmd := DontCare
-  ln_mvout_cmd.inst.funct := STORE_CMD
-  ln_mvout_cmd.rs1 := ln_dram_addr
-
-  val ln_mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType)
-  ln_mvout_cmd_rs2 := DontCare
-  ln_mvout_cmd_rs2.num_rows := 1.U
-  ln_mvout_cmd_rs2.num_cols := cols.asUInt()
-  ln_mvout_cmd_rs2.local_addr := cast_to_acc_addr(ln_mvout_cmd_rs2.local_addr, ln_sp_addr, accumulate = false.B, read_full = req.full_c)
-  ln_mvout_cmd_rs2.local_addr.norm_cmd := ln_norm_cmd
-  ln_mvout_cmd.rs2 := ln_mvout_cmd_rs2.asUInt()
-
   io.req.ready := state === idle
   io.j := j
   io.i := i

From ff8bda845796ee663983b832410df6f744bffaa0 Mon Sep 17 00:00:00 2001
From: SingularityKChen <chency_singularity@163.com>
Date: Sat, 21 Jan 2023 14:33:59 +0800
Subject: [PATCH 21/64] fix: Remove parenthesized forms of asUInt() (#273)

---
 src/main/scala/gemmini/AccumulatorScale.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala
index bf98a144..e4aaa192 100644
--- a/src/main/scala/gemmini/AccumulatorScale.scala
+++ b/src/main/scala/gemmini/AccumulatorScale.scala
@@ -160,7 +160,7 @@ class AccumulatorScale[T <: Data, U <: Data](
           regs(i).valid := false.B
         }
       }
-      head_oh := (head_oh << 1).asUInt() | head_oh(nEntries-1)
+      head_oh := (head_oh << 1).asUInt | head_oh(nEntries-1)
     }
 
     io.in.ready := !Mux1H(tail_oh.asBools, regs.map(_.valid)) || (tail_oh === head_oh && out.fire)

From 0a4517d789cf80224ff228fd314e531b8b6b3349 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Tue, 31 Jan 2023 15:57:32 -0800
Subject: [PATCH 22/64] bump chipyard to a version which compiles with 2.13

---
 CHIPYARD.hash | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHIPYARD.hash b/CHIPYARD.hash
index 7fb91902..654fff8f 100644
--- a/CHIPYARD.hash
+++ b/CHIPYARD.hash
@@ -1 +1 @@
-004297b6a8c01be1b2110c4cf4f9393ae1ff8805
+c8867beceece64dd682cc603e0ccfa0eb5b1d2a6

From 8bf01ea043b47305014143cb1a8ce449044baa64 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 31 Jan 2023 16:44:40 -0800
Subject: [PATCH 23/64] Add -f flag to chipyard CI

---
 .github/scripts/install-gemmini.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
index 0fa6460d..eaaeb494 100755
--- a/.github/scripts/install-gemmini.sh
+++ b/.github/scripts/install-gemmini.sh
@@ -23,7 +23,7 @@ cd $LOCAL_CHIPYARD_DIR
 git fetch
 git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
 
-./build-setup.sh esp-tools
+./build-setup.sh esp-tools -f
 
 source env.sh
 

From 50429a57f3919934606c158a04aa282e82a8cdca Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 31 Jan 2023 17:15:51 -0800
Subject: [PATCH 24/64] Skip unnecessary parts of chipyard install

---
 .github/scripts/install-gemmini.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
index eaaeb494..dba6d32d 100755
--- a/.github/scripts/install-gemmini.sh
+++ b/.github/scripts/install-gemmini.sh
@@ -23,7 +23,7 @@ cd $LOCAL_CHIPYARD_DIR
 git fetch
 git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
 
-./build-setup.sh esp-tools -f
+./build-setup.sh esp-tools -f -s 6 -s 7 -s 8 -s 9
 
 source env.sh
 

From 0c1be72547e87a0bd1e950113d5bd58812f8520b Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Wed, 1 Feb 2023 00:28:38 -0800
Subject: [PATCH 25/64] Bump gemmini rocc tests

---
 software/gemmini-rocc-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index ae0cd823..211e95f9 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit ae0cd8236d32fccf7197a7ac0634df5513cec4db
+Subproject commit 211e95f961d174788c028cc42a75379585b16c63

From b8848757e0ae67db2629c8ec4bb025cbb0a75319 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Wed, 1 Feb 2023 09:38:40 -0800
Subject: [PATCH 26/64] Only install esp-isa-sim for spike-tests

---
 .github/scripts/install-gemmini.sh | 4 ----
 .github/scripts/run-tests-spike.sh | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
index dba6d32d..ec3c4f75 100755
--- a/.github/scripts/install-gemmini.sh
+++ b/.github/scripts/install-gemmini.sh
@@ -27,10 +27,6 @@ git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
 
 source env.sh
 
-cd toolchains/esp-tools/riscv-isa-sim/build
-git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)
-make && make install
-
 cd $LOCAL_CHECKOUT_DIR
 chown -R $(whoami) .
 git config --global --add safe.directory $LOCAL_CHECKOUT_DIR
diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh
index 93288a75..cefe5a8e 100755
--- a/.github/scripts/run-tests-spike.sh
+++ b/.github/scripts/run-tests-spike.sh
@@ -10,6 +10,10 @@ source $SCRIPT_DIR/enable-conda.sh
 cd $LOCAL_CHIPYARD_DIR
 source env.sh
 
+cd $LOCAL_CHIPYARD_DIR/toolchains/esp-tools/riscv-isa-sim/build
+git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)
+make && make install
+
 cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests
 ./build.sh
 

From 0529232d879c27f4bc7942a1ccdb0c71dd1999f2 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Wed, 1 Feb 2023 09:51:48 -0800
Subject: [PATCH 27/64] Bump gemmini-rocc-tests

---
 .github/scripts/install-gemmini.sh | 2 +-
 software/gemmini-rocc-tests        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
index ec3c4f75..8a89a106 100755
--- a/.github/scripts/install-gemmini.sh
+++ b/.github/scripts/install-gemmini.sh
@@ -23,7 +23,7 @@ cd $LOCAL_CHIPYARD_DIR
 git fetch
 git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
 
-./build-setup.sh esp-tools -f -s 6 -s 7 -s 8 -s 9
+./build-setup.sh riscv-tools -f -s 6 -s 7 -s 8 -s 9
 
 source env.sh
 
diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index 211e95f9..61dbeb6a 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit 211e95f961d174788c028cc42a75379585b16c63
+Subproject commit 61dbeb6a92cf8386e5171da2d97bfe1ad5587151

From 534f8fc2569bb6e0aeb841abb76ae0d97e1eb3bf Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Wed, 1 Feb 2023 13:40:11 -0800
Subject: [PATCH 28/64] Fix spike build in CI

---
 .github/scripts/run-tests-spike.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh
index cefe5a8e..c21274f2 100755
--- a/.github/scripts/run-tests-spike.sh
+++ b/.github/scripts/run-tests-spike.sh
@@ -10,8 +10,12 @@ source $SCRIPT_DIR/enable-conda.sh
 cd $LOCAL_CHIPYARD_DIR
 source env.sh
 
-cd $LOCAL_CHIPYARD_DIR/toolchains/esp-tools/riscv-isa-sim/build
+git clone git@github.com:ucb-bar/esp-isa-sim.git
+cd esp-isa-sim
 git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)
+mkdir build
+cd build
+../configure --prefix=$RISCV
 make && make install
 
 cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests

From 17a03981dd71155a5d5d22d5d62b35178b2b840c Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Thu, 2 Feb 2023 14:44:07 -0800
Subject: [PATCH 29/64] Switch to https spike clone in CI

---
 .github/scripts/run-tests-spike.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh
index c21274f2..271b63f3 100755
--- a/.github/scripts/run-tests-spike.sh
+++ b/.github/scripts/run-tests-spike.sh
@@ -10,7 +10,7 @@ source $SCRIPT_DIR/enable-conda.sh
 cd $LOCAL_CHIPYARD_DIR
 source env.sh
 
-git clone git@github.com:ucb-bar/esp-isa-sim.git
+git clone https://github.com/ucb-bar/esp-isa-sim.git
 cd esp-isa-sim
 git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)
 mkdir build

From aece1a6078e8f93cbbcc7f55a9b9f88e47df1c0b Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Fri, 3 Feb 2023 16:35:11 -0800
Subject: [PATCH 30/64] Tie off debug in CI tests

---
 .github/scripts/do-rtl-build.sh  | 2 +-
 .github/scripts/run-tests-rtl.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/do-rtl-build.sh b/.github/scripts/do-rtl-build.sh
index 38651571..36cbbcbf 100755
--- a/.github/scripts/do-rtl-build.sh
+++ b/.github/scripts/do-rtl-build.sh
@@ -14,5 +14,5 @@ source env.sh
 
 cd $LOCAL_SIM_DIR
 make -C $LOCAL_SIM_DIR clean
-make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=GemminiRocketConfig
+make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=chipyard.harness.WithTiedOffDebug_GemminiRocketConfig
 
diff --git a/.github/scripts/run-tests-rtl.sh b/.github/scripts/run-tests-rtl.sh
index 47a87ff1..14bf137b 100755
--- a/.github/scripts/run-tests-rtl.sh
+++ b/.github/scripts/run-tests-rtl.sh
@@ -14,5 +14,5 @@ cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests
 CFLAGS=-DFAST ./build.sh
 
 cd build
-make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_CHIPYARD_DIR/sims/verilator/ CONFIG=GemminiRocketConfig run-binary-hex BINARY='"
+make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_CHIPYARD_DIR/sims/verilator/ CONFIG=chipyard.harness.WithTiedOffDebug_GemminiRocketConfig run-binary-hex BINARY='"
 

From 3887f5181c91369aa2e1a3aeddbb12858bf2ccc6 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sat, 4 Feb 2023 11:21:11 -0800
Subject: [PATCH 31/64] Remove debug module entirely from CI

---
 .github/scripts/defaults.sh      | 2 ++
 .github/scripts/do-rtl-build.sh  | 2 +-
 .github/scripts/run-tests-rtl.sh | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/defaults.sh b/.github/scripts/defaults.sh
index e403fc89..b49ae66e 100755
--- a/.github/scripts/defaults.sh
+++ b/.github/scripts/defaults.sh
@@ -30,6 +30,8 @@ LOCAL_SIM_DIR=$LOCAL_CHIPYARD_DIR/sims/verilator
 LOCAL_VERILATOR_DIR=$HOME/verilator-install
 LOCAL_CONDA=/opt/conda/
 
+CICONFIG=chipyard.config.WithNoDebug_GemminiRocketConfig
+
 echo "::set-output name=LOCAL_WORK_DIR::$LOCAL_WORK_DIR"
 echo "::set-output name=LOCAL_CHECKOUT_DIR::$LOCAL_CHECKOUT_DIR"
 echo "::set-output name=LOCAL_RISCV_DIR::$LOCAL_RISCV_DIR"
diff --git a/.github/scripts/do-rtl-build.sh b/.github/scripts/do-rtl-build.sh
index 36cbbcbf..3e26a04f 100755
--- a/.github/scripts/do-rtl-build.sh
+++ b/.github/scripts/do-rtl-build.sh
@@ -14,5 +14,5 @@ source env.sh
 
 cd $LOCAL_SIM_DIR
 make -C $LOCAL_SIM_DIR clean
-make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=chipyard.harness.WithTiedOffDebug_GemminiRocketConfig
+make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=$CICONFIG
 
diff --git a/.github/scripts/run-tests-rtl.sh b/.github/scripts/run-tests-rtl.sh
index 14bf137b..e179bd37 100755
--- a/.github/scripts/run-tests-rtl.sh
+++ b/.github/scripts/run-tests-rtl.sh
@@ -14,5 +14,5 @@ cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests
 CFLAGS=-DFAST ./build.sh
 
 cd build
-make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_CHIPYARD_DIR/sims/verilator/ CONFIG=chipyard.harness.WithTiedOffDebug_GemminiRocketConfig run-binary-hex BINARY='"
+make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_CHIPYARD_DIR/sims/verilator/ CONFIG=$CICONFIG run-binary-hex BINARY='"
 

From ab5d181b91c21ccf6727150070fe6a0cbcbefa6e Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 5 Feb 2023 09:19:16 -0800
Subject: [PATCH 32/64] Add libgemmini submodule

---
 .gitmodules         | 3 +++
 software/libgemmini | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 software/libgemmini

diff --git a/.gitmodules b/.gitmodules
index 19df446b..fbd4220e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "software/onnxruntime-riscv"]
 	path = software/onnxruntime-riscv
 	url = https://github.com/pranav-prakash/onnxruntime-riscv.git
+[submodule "software/libgemmini"]
+	path = software/libgemmini
+	url = https://github.com/ucb-bar/libgemmini.git
diff --git a/software/libgemmini b/software/libgemmini
new file mode 160000
index 00000000..144b975a
--- /dev/null
+++ b/software/libgemmini
@@ -0,0 +1 @@
+Subproject commit 144b975a14646b22d23696aadd0b543423182a21

From c1a8807155e19bf3b062e3893d379c0d9b6131f9 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 5 Feb 2023 09:20:39 -0800
Subject: [PATCH 33/64] Bump chipyard

---
 CHIPYARD.hash | 2 +-
 SPIKE.hash    | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)
 delete mode 100644 SPIKE.hash

diff --git a/CHIPYARD.hash b/CHIPYARD.hash
index 654fff8f..0e76f9a8 100644
--- a/CHIPYARD.hash
+++ b/CHIPYARD.hash
@@ -1 +1 @@
-c8867beceece64dd682cc603e0ccfa0eb5b1d2a6
+30019f3fd739c49206afafea84b316312832efd8
diff --git a/SPIKE.hash b/SPIKE.hash
deleted file mode 100644
index 8cbb8d37..00000000
--- a/SPIKE.hash
+++ /dev/null
@@ -1 +0,0 @@
-051d820f08be84d069993de4375d29c91eb2f577

From b6389f3ea7bbf070aa3dd40972daa5be7e2d4261 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 5 Feb 2023 09:24:18 -0800
Subject: [PATCH 34/64] Remove esp-isa-sim install from CI

---
 .github/scripts/run-tests-spike.sh | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh
index 271b63f3..93288a75 100755
--- a/.github/scripts/run-tests-spike.sh
+++ b/.github/scripts/run-tests-spike.sh
@@ -10,14 +10,6 @@ source $SCRIPT_DIR/enable-conda.sh
 cd $LOCAL_CHIPYARD_DIR
 source env.sh
 
-git clone https://github.com/ucb-bar/esp-isa-sim.git
-cd esp-isa-sim
-git checkout $(cat $LOCAL_CHECKOUT_DIR/SPIKE.hash)
-mkdir build
-cd build
-../configure --prefix=$RISCV
-make && make install
-
 cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests
 ./build.sh
 

From 8b2661100e047402edf0b870bd969acf6e6348a4 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 5 Feb 2023 10:01:02 -0800
Subject: [PATCH 35/64] Bump chipyard

---
 CHIPYARD.hash | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHIPYARD.hash b/CHIPYARD.hash
index 0e76f9a8..84851613 100644
--- a/CHIPYARD.hash
+++ b/CHIPYARD.hash
@@ -1 +1 @@
-30019f3fd739c49206afafea84b316312832efd8
+e5a734e20a2f4f54db14a190bc0de5efe51f15b2

From b8dcc3ae0b576ec2a17b33254ac181b6bc2f8345 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 5 Feb 2023 10:01:14 -0800
Subject: [PATCH 36/64] Install our own libgemmini

---
 .github/scripts/install-gemmini.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
index 8a89a106..7d4c4ad3 100755
--- a/.github/scripts/install-gemmini.sh
+++ b/.github/scripts/install-gemmini.sh
@@ -27,6 +27,8 @@ git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
 
 source env.sh
 
+make -C $LOCAL_CHECKOUT_DIR/software/libgemmini
+
 cd $LOCAL_CHECKOUT_DIR
 chown -R $(whoami) .
 git config --global --add safe.directory $LOCAL_CHECKOUT_DIR

From 24079deb6b30eea8941ba511da00d70a2d2fe540 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 5 Feb 2023 10:36:42 -0800
Subject: [PATCH 37/64] Bump chipyard

---
 CHIPYARD.hash | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHIPYARD.hash b/CHIPYARD.hash
index 84851613..747b6b18 100644
--- a/CHIPYARD.hash
+++ b/CHIPYARD.hash
@@ -1 +1 @@
-e5a734e20a2f4f54db14a190bc0de5efe51f15b2
+bcbe3b7f1f40d1c388aca68df498fd7dd4d16e89

From 75c534aa2586aa9d2bb62edaf2e1fcf9bd85135b Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 5 Feb 2023 11:06:59 -0800
Subject: [PATCH 38/64] Fix install-gemmini script

---
 .github/scripts/install-gemmini.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
index 7d4c4ad3..dac96eea 100755
--- a/.github/scripts/install-gemmini.sh
+++ b/.github/scripts/install-gemmini.sh
@@ -27,7 +27,9 @@ git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
 
 source env.sh
 
-make -C $LOCAL_CHECKOUT_DIR/software/libgemmini
+cd $LOCAL_CHECKOUT_DIR
+git submodule update --init software/libgemmini
+make -C software/libgemmini
 
 cd $LOCAL_CHECKOUT_DIR
 chown -R $(whoami) .

From 89a5c8f84c8c2fcd233627ba5c2653222562d620 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 5 Feb 2023 11:46:43 -0800
Subject: [PATCH 39/64] Fix git dubious ownership warning

---
 .github/scripts/install-gemmini.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
index dac96eea..37260e05 100755
--- a/.github/scripts/install-gemmini.sh
+++ b/.github/scripts/install-gemmini.sh
@@ -28,6 +28,7 @@ git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
 source env.sh
 
 cd $LOCAL_CHECKOUT_DIR
+git config --global --add safe.directory $LOCAL_CHECKOUT_DIR
 git submodule update --init software/libgemmini
 make -C software/libgemmini
 

From fbe4ec19911e722bea4077f3f3c761ef405820ab Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sun, 5 Feb 2023 13:54:42 -0800
Subject: [PATCH 40/64] try to fix safe-directory errors

---
 .github/scripts/install-gemmini.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
index 37260e05..b98e6250 100755
--- a/.github/scripts/install-gemmini.sh
+++ b/.github/scripts/install-gemmini.sh
@@ -28,13 +28,15 @@ git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash)
 source env.sh
 
 cd $LOCAL_CHECKOUT_DIR
+chown -R $(whoami) .
 git config --global --add safe.directory $LOCAL_CHECKOUT_DIR
+git config --global --add safe.directory '*'
+
+cd $LOCAL_CHECKOUT_DIR
 git submodule update --init software/libgemmini
 make -C software/libgemmini
 
 cd $LOCAL_CHECKOUT_DIR
-chown -R $(whoami) .
-git config --global --add safe.directory $LOCAL_CHECKOUT_DIR
 git submodule update --init --recursive software/gemmini-rocc-tests
 rm -rf $LOCAL_CHIPYARD_DIR/generators/gemmini/* $LOCAL_CHIPYARD_DIR/generators/gemmini/.git*
 mv -f $LOCAL_CHECKOUT_DIR/* $LOCAL_CHECKOUT_DIR/.git* $LOCAL_CHIPYARD_DIR/generators/gemmini/

From 43a2ebcaf08e555171185f70a0ac8511eebdcba3 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 5 Feb 2023 20:32:06 -0800
Subject: [PATCH 41/64] Bump libgemmini

---
 software/libgemmini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/software/libgemmini b/software/libgemmini
index 144b975a..4be22079 160000
--- a/software/libgemmini
+++ b/software/libgemmini
@@ -1 +1 @@
-Subproject commit 144b975a14646b22d23696aadd0b543423182a21
+Subproject commit 4be220794cfdb834e8ecc2ee7becdf8632cc268c

From 8c33285c5a15aa60929e300b7ec766f4ef799c4e Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 5 Feb 2023 22:41:22 -0800
Subject: [PATCH 42/64] Delete stale libgemmini before installing in CI

---
 .github/scripts/install-gemmini.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
index b98e6250..1acab7b9 100755
--- a/.github/scripts/install-gemmini.sh
+++ b/.github/scripts/install-gemmini.sh
@@ -33,6 +33,8 @@ git config --global --add safe.directory $LOCAL_CHECKOUT_DIR
 git config --global --add safe.directory '*'
 
 cd $LOCAL_CHECKOUT_DIR
+# Delete the stale libgemmini first installed by chipyard, switch to the one submoduled here
+rm -rf $RISCV/lib/libgemmini.so
 git submodule update --init software/libgemmini
 make -C software/libgemmini
 

From 6dd1ae9ddd084acc2506f14f0510e3244f25b30d Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sun, 5 Feb 2023 23:37:15 -0800
Subject: [PATCH 43/64] install libgemmini in run-tests-spike for ci

---
 .github/scripts/install-gemmini.sh |  6 ------
 .github/scripts/run-tests-spike.sh | 11 +++++++++++
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh
index 1acab7b9..daf2d891 100755
--- a/.github/scripts/install-gemmini.sh
+++ b/.github/scripts/install-gemmini.sh
@@ -32,12 +32,6 @@ chown -R $(whoami) .
 git config --global --add safe.directory $LOCAL_CHECKOUT_DIR
 git config --global --add safe.directory '*'
 
-cd $LOCAL_CHECKOUT_DIR
-# Delete the stale libgemmini first installed by chipyard, switch to the one submoduled here
-rm -rf $RISCV/lib/libgemmini.so
-git submodule update --init software/libgemmini
-make -C software/libgemmini
-
 cd $LOCAL_CHECKOUT_DIR
 git submodule update --init --recursive software/gemmini-rocc-tests
 rm -rf $LOCAL_CHIPYARD_DIR/generators/gemmini/* $LOCAL_CHIPYARD_DIR/generators/gemmini/.git*
diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh
index 93288a75..9a2d8ede 100755
--- a/.github/scripts/run-tests-spike.sh
+++ b/.github/scripts/run-tests-spike.sh
@@ -10,6 +10,17 @@ source $SCRIPT_DIR/enable-conda.sh
 cd $LOCAL_CHIPYARD_DIR
 source env.sh
 
+cd $LOCAL_CHECKOUT_DIR
+chown -R $(whoami) .
+git config --global --add safe.directory $LOCAL_CHECKOUT_DIR
+git config --global --add safe.directory '*'
+
+cd $LOCAL_CHECKOUT_DIR
+# Delete the stale libgemmini first installed by chipyard, switch to the one submoduled here
+rm -rf $RISCV/lib/libgemmini.so
+git submodule update --init software/libgemmini
+make -C software/libgemmini
+
 cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests
 ./build.sh
 

From 1d66652bb400f61bd2a1f6cdc08ab8a73dcd32c2 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Mon, 6 Feb 2023 09:41:04 -0800
Subject: [PATCH 44/64] Actually install our libgemmini in CI

---
 .github/scripts/run-tests-spike.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh
index 9a2d8ede..41774cfd 100755
--- a/.github/scripts/run-tests-spike.sh
+++ b/.github/scripts/run-tests-spike.sh
@@ -19,7 +19,7 @@ cd $LOCAL_CHECKOUT_DIR
 # Delete the stale libgemmini first installed by chipyard, switch to the one submoduled here
 rm -rf $RISCV/lib/libgemmini.so
 git submodule update --init software/libgemmini
-make -C software/libgemmini
+make -C software/libgemmini install
 
 cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests
 ./build.sh

From 7c8657cf20aad1f76f4dc1b216dc35b9156c6768 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 7 Feb 2023 09:43:03 -0800
Subject: [PATCH 45/64] [ci skip] Update README/scripts with new isa-sim
 instructions

---
 README.md              | 14 ++++----------
 scripts/build-spike.sh |  6 +++---
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 5f310564..4caf35bb 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ Run these steps to install Chipyard and Spike (make sure to checkout the correct
 git clone https://github.com/ucb-bar/chipyard.git
 cd chipyard
 git checkout 1.8.1
-./build-setup.sh esp-tools
+./build-setup.sh riscv-tools
 
 source env.sh
 
@@ -42,12 +42,7 @@ git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
 git checkout dev && git pull origin dev
 git submodule update --init --recursive
 
-SPIKE_HASH=$(cat SPIKE.hash)
-
-cd -
-cd toolchains/esp-tools/riscv-isa-sim/build
-git fetch && git checkout $SPIKE_HASH
-make && make install
+make -C software/libgemmini install
 
 # The final step is only necessary if you want to run MIDAS simulations with
 # realistic DRAM models
@@ -368,9 +363,8 @@ Afterwards, the test binaries will be found in `software/gemmini-rocc-tests/buil
 Binaries whose names end in `-baremetal` are meant to be run in a bare-metal environment, while binaries whose names end in `-linux` are meant to run in a Linux environment.
 You can run the tests either on a cycle-accurate RTL simulator, or on a (much faster) functional ISA simulator called Spike.
 
-We use a special fork of Spike, found [here](https://github.com/ucb-bar/esp-isa-sim), which has support for Gemmini instructions.
-(You can find the required commit hash in `SPIKE.hash`).
-If you are using Chipyard, you can easily build Spike by running `./scripts/build-toolchains.sh esp-tools` from Chipyard's root directory.
+We use a special extension of Spike, found [here](https://github.com/ucb-bar/libgemmini), which has support for Gemmini instructions.
+If you are using Chipyard, you can easily build Spike by running `./scripts/build-toolchains.sh riscv-tools` from Chipyard's root directory, then by running `make -C software/libgemmini install` in the Gemmini directory.
 Then, to run the `mvin_mvout` test, which simply moves a matrix into Gemmini's scratchpad before moving it back out into main memory, run the following commands:
 
 ```shell
diff --git a/scripts/build-spike.sh b/scripts/build-spike.sh
index f7b1baf2..1df963ab 100755
--- a/scripts/build-spike.sh
+++ b/scripts/build-spike.sh
@@ -24,6 +24,6 @@ echo Generating new gemmini_params.h file...
 make verilog CONFIG=CustomGemminiSoCConfig &> build.log
 
 cd -
-cp software/gemmini-rocc-tests/include/gemmini_params.h ../../toolchains/esp-tools/riscv-isa-sim/gemmini/gemmini_params.h
-cd ../../toolchains/esp-tools/riscv-isa-sim/build
-make && make install
+cp software/gemmini-rocc-tests/include/gemmini_params.h software/libgemmini/gemmini_params.h
+make -C software/libgemmini clean
+make -C software/libgemmini install

From 074a880df20a2a3bb0b95ddb707aae8dd06ed324 Mon Sep 17 00:00:00 2001
From: Jamie Hong <27294970+curiousdragon@users.noreply.github.com>
Date: Sat, 11 Feb 2023 08:55:02 -0800
Subject: [PATCH 46/64] Update gemmini-rocc-tests to support rectangular
 convolutions (#278)

---------

Co-authored-by: Hasan Genc <hngenc@berkeley.edu>
---
 software/gemmini-rocc-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index 61dbeb6a..bc3cabb4 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit 61dbeb6a92cf8386e5171da2d97bfe1ad5587151
+Subproject commit bc3cabb4081e3dbaa1d33119f0471357f2aa3927

From 9e478ecce9e48bbc03b9bd3535d71e03a6269fba Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Wed, 22 Feb 2023 11:07:28 -0800
Subject: [PATCH 47/64] Avoid depending on testchipip

---
 src/main/scala/gemmini/DMA.scala | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala
index 3744282c..6d87a01e 100644
--- a/src/main/scala/gemmini/DMA.scala
+++ b/src/main/scala/gemmini/DMA.scala
@@ -8,8 +8,7 @@ import chisel3.experimental.DataMirror
 import freechips.rocketchip.config.Parameters
 import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp}
 import freechips.rocketchip.tile.{CoreBundle, HasCoreParameters}
-import freechips.rocketchip.tilelink.TLBundleA
-import testchipip.TLHelper
+import freechips.rocketchip.tilelink._
 import freechips.rocketchip.rocket.MStatus
 import freechips.rocketchip.rocket.constants.MemoryOpConstants
 
@@ -128,8 +127,8 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf
                                                         use_tlb_register_filter: Boolean,
                                                         use_firesim_simulation_counters: Boolean)
                                  (implicit p: Parameters) extends LazyModule {
-  val node = TLHelper.makeClientNode(
-    name = "stream-reader", sourceId = IdRange(0, nXacts))
+  val node = TLClientNode(Seq(TLMasterPortParameters.v1(Seq(TLClientParameters(
+    name = "stream-reader", sourceId = IdRange(0, nXacts))))))
 
   require(isPow2(aligned_to))
 
@@ -350,8 +349,8 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
                                           inputType: T, block_cols: Int, use_tlb_register_filter: Boolean,
                                           use_firesim_simulation_counters: Boolean)
                   (implicit p: Parameters) extends LazyModule {
-  val node = TLHelper.makeClientNode(
-    name = "stream-writer", sourceId = IdRange(0, nXacts))
+  val node = TLClientNode(Seq(TLMasterPortParameters.v1(Seq(TLClientParameters(
+    name = "stream-writer", sourceId = IdRange(0, nXacts))))))
 
   require(isPow2(aligned_to))
 

From 1a8eaba5129afeed39147a833e9265079c151816 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Sat, 11 Mar 2023 10:32:29 -0800
Subject: [PATCH 48/64] Bump software to get gcc12 fix

---
 software/gemmini-rocc-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index bc3cabb4..4a2f0061 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit bc3cabb4081e3dbaa1d33119f0471357f2aa3927
+Subproject commit 4a2f00614d390baeb996919cb2bbc6d2b307b918

From a916bfb1a2bccb4a7caef289e22c5a1068cba54a Mon Sep 17 00:00:00 2001
From: abejgonzalez <abe.j.gonza@gmail.com>
Date: Sat, 11 Mar 2023 22:29:21 -0800
Subject: [PATCH 49/64] Add LeanGemminiConfig for tutorial

---
 src/main/scala/gemmini/Configs.scala | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala
index bd84b317..19fc1735 100644
--- a/src/main/scala/gemmini/Configs.scala
+++ b/src/main/scala/gemmini/Configs.scala
@@ -1,4 +1,3 @@
-
 package gemmini
 
 import chisel3._
@@ -256,6 +255,21 @@ class DefaultGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
   )
 })
 
+/**
+ * Mixin which sets the default lean parameters for a systolic array accelerator.
+ */
+class LeanGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
+  gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiConfigs.leanConfig
+) extends Config((site, here, up) => {
+  case BuildRoCC => up(BuildRoCC) ++ Seq(
+    (p: Parameters) => {
+      implicit val q = p
+      val gemmini = LazyModule(new Gemmini(gemminiConfig))
+      gemmini
+    }
+  )
+})
+
 // This Gemmini config has both an Int and an FP Gemmini side-by-side, sharing
 // the same scratchpad.
 class DualGemminiConfig extends Config((site, here, up) => {

From e507416c38f63cd19eac9bf8c11c5087b3ba2624 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sun, 12 Mar 2023 12:52:42 -0700
Subject: [PATCH 50/64] bump gemmini-rocc-tests

---
 software/gemmini-rocc-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index 4a2f0061..b5e531e9 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit 4a2f00614d390baeb996919cb2bbc6d2b307b918
+Subproject commit b5e531e98303b1ad98386cd4b82262a68e1d4ce6

From 5abbe54a0d6f8b3c86699bd559d082df2912ed41 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sun, 12 Mar 2023 19:03:29 -0700
Subject: [PATCH 51/64] bump gemmini-rocc-tests to dev

---
 software/gemmini-rocc-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index b5e531e9..13e7e1fc 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit b5e531e98303b1ad98386cd4b82262a68e1d4ce6
+Subproject commit 13e7e1fce1a8d332eea563c14130136ef0533b16

From 686cb15dad756887db62460968bc616392bc4341 Mon Sep 17 00:00:00 2001
From: Sagar Karandikar <sagark@eecs.berkeley.edu>
Date: Thu, 23 Mar 2023 01:59:03 +0000
Subject: [PATCH 52/64] add printf config for tutorials

---
 src/main/scala/gemmini/Configs.scala | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala
index 19fc1735..8db9a0a7 100644
--- a/src/main/scala/gemmini/Configs.scala
+++ b/src/main/scala/gemmini/Configs.scala
@@ -236,6 +236,9 @@ object GemminiConfigs {
   )
 
   val leanConfig = defaultConfig.copy(dataflow=Dataflow.WS, max_in_flight_mem_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true)
+
+  val leanPrintfConfig = defaultConfig.copy(dataflow=Dataflow.WS, max_in_flight_mem_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true, use_firesim_simulation_counters=true)
+
 }
 
 /**
@@ -270,6 +273,18 @@ class LeanGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
   )
 })
 
+class LeanGemminiPrintfConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
+  gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiConfigs.leanPrintfConfig
+) extends Config((site, here, up) => {
+  case BuildRoCC => up(BuildRoCC) ++ Seq(
+    (p: Parameters) => {
+      implicit val q = p
+      val gemmini = LazyModule(new Gemmini(gemminiConfig))
+      gemmini
+    }
+  )
+})
+
 // This Gemmini config has both an Int and an FP Gemmini side-by-side, sharing
 // the same scratchpad.
 class DualGemminiConfig extends Config((site, here, up) => {

From a9a92eabf77fff200dd3bebf84e83bd22858f178 Mon Sep 17 00:00:00 2001
From: Seah <54855793+SeahK@users.noreply.github.com>
Date: Mon, 27 Mar 2023 16:46:29 -0700
Subject: [PATCH 53/64] add dummy default config

---
 src/main/scala/gemmini/Configs.scala | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala
index 19fc1735..9e7c4b99 100644
--- a/src/main/scala/gemmini/Configs.scala
+++ b/src/main/scala/gemmini/Configs.scala
@@ -172,15 +172,15 @@ object GemminiConfigs {
     meshRows     = defaultConfig.meshRows,
     meshColumns  = defaultConfig.meshColumns,
     dataflow     = defaultConfig.dataflow,
-    sp_capacity  = defaultConfig.sp_capacity,
-    acc_capacity = defaultConfig.acc_capacity,
+    sp_capacity  = CapacityInKilobytes(128),
+    acc_capacity = CapacityInKilobytes(128),
     sp_banks     = defaultConfig.sp_banks,
     acc_banks    = defaultConfig.acc_banks,
     sp_singleported = defaultConfig.sp_singleported,
     acc_singleported = defaultConfig.acc_singleported,
-    has_training_convs = defaultConfig.has_training_convs,
+    has_training_convs = false,
     has_max_pool = defaultConfig.has_max_pool,
-    has_nonlinear_activations = defaultConfig.has_nonlinear_activations,
+    has_nonlinear_activations = false,
     reservation_station_entries_ld = defaultConfig.reservation_station_entries_ld,
     reservation_station_entries_st = defaultConfig.reservation_station_entries_st,
     reservation_station_entries_ex = defaultConfig.reservation_station_entries_ex,
@@ -209,14 +209,14 @@ object GemminiConfigs {
       c_str = "({float y = ROUND_NEAR_EVEN((x) * (scale)); y > INT8_MAX ? INT8_MAX : (y < INT8_MIN ? INT8_MIN : (acc_t)y);})"
     )),
 
-    num_counter = defaultConfig.num_counter,
+    num_counter = 0,
 
-    acc_read_full_width = defaultConfig.acc_read_full_width,
+    acc_read_full_width = false,
     acc_read_small_width = defaultConfig.acc_read_small_width,
 
     ex_read_from_spad = defaultConfig.ex_read_from_spad,
-    ex_read_from_acc = defaultConfig.ex_read_from_acc,
-    ex_write_to_spad = defaultConfig.ex_write_to_spad,
+    ex_read_from_acc = false,
+    ex_write_to_spad = false,
     ex_write_to_acc = defaultConfig.ex_write_to_acc,
   )
 
@@ -270,6 +270,18 @@ class LeanGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
   )
 })
 
+class DummyDefaultGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
+  gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiConfigs.dummyConfig
+) extends Config((site, here, up) => {
+  case BuildRoCC => up(BuildRoCC) ++ Seq(
+    (p: Parameters) => {
+      implicit val q = p
+      val gemmini = LazyModule(new Gemmini(gemminiConfig))
+      gemmini
+    }
+  )
+})
+
 // This Gemmini config has both an Int and an FP Gemmini side-by-side, sharing
 // the same scratchpad.
 class DualGemminiConfig extends Config((site, here, up) => {

From 8f5f6d556d1e67daecd6e5b75ff5c327e767d77e Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Wed, 29 Mar 2023 11:47:51 -0700
Subject: [PATCH 54/64] Fix typo

---
 src/main/scala/gemmini/Configs.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala
index 960dac4d..b0d73764 100644
--- a/src/main/scala/gemmini/Configs.scala
+++ b/src/main/scala/gemmini/Configs.scala
@@ -273,7 +273,7 @@ class LeanGemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
   )
 })
 
-class LeanGemminiPrintfConfig[T <: Data : Arithmetic, U <: Data, V <: Data]
+class LeanGemminiPrintfConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
   gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiConfigs.leanPrintfConfig
 ) extends Config((site, here, up) => {
   case BuildRoCC => up(BuildRoCC) ++ Seq(

From 4dd19f3e93d2decc24048eb1776e53fb20855841 Mon Sep 17 00:00:00 2001
From: abejgonzalez <abe.j.gonza@gmail.com>
Date: Thu, 30 Mar 2023 23:19:56 -0700
Subject: [PATCH 55/64] Add smoke test | Support Gemmini spike

---
 software/gemmini-ort.json               |  4 ++--
 software/gemmini-smoke.json             | 10 ++++++++++
 software/gemmini-tests-full.json        |  3 ++-
 software/gemmini-tests-interactive.json |  3 ++-
 software/gemmini-tests.json             |  3 ++-
 software/overlay/root/run-test-smoke.sh |  9 +++++++++
 6 files changed, 27 insertions(+), 5 deletions(-)
 create mode 100644 software/gemmini-smoke.json
 create mode 100755 software/overlay/root/run-test-smoke.sh

diff --git a/software/gemmini-ort.json b/software/gemmini-ort.json
index c4a95253..a04c2869 100644
--- a/software/gemmini-ort.json
+++ b/software/gemmini-ort.json
@@ -53,6 +53,6 @@
   ],
   "overlay": "../onnxruntime-riscv/systolic_runner/imagenet_runner",
   "rootfs-size": "16GiB",
-  "run": "run-ort.sh"
+  "run": "run-ort.sh",
+  "spike-args": "--extension=gemmini"
 }
-
diff --git a/software/gemmini-smoke.json b/software/gemmini-smoke.json
new file mode 100644
index 00000000..cdfb4d13
--- /dev/null
+++ b/software/gemmini-smoke.json
@@ -0,0 +1,10 @@
+{
+  "name" : "gemmini-smoke",
+  "workdir" : ".",
+  "base" : "br-base.json",
+  "overlay" : "overlay",
+  "host-init" : "host-init.sh",
+  "command": "/root/run-test-smoke.sh",
+  "rootfs-size" : "16GiB",
+  "spike-args" : "--extension=gemmini"
+}
diff --git a/software/gemmini-tests-full.json b/software/gemmini-tests-full.json
index c4c56d29..6acfced1 100644
--- a/software/gemmini-tests-full.json
+++ b/software/gemmini-tests-full.json
@@ -4,5 +4,6 @@
   "base" : "br-base.json",
   "overlay" : "overlay",
   "host-init" : "host-init.sh",
-  "command": "/root/run-tests-full.sh"
+  "command": "/root/run-tests-full.sh",
+  "spike-args": "--extension=gemmini"
 }
diff --git a/software/gemmini-tests-interactive.json b/software/gemmini-tests-interactive.json
index 0fe52409..72eca491 100644
--- a/software/gemmini-tests-interactive.json
+++ b/software/gemmini-tests-interactive.json
@@ -4,5 +4,6 @@
   "base" : "br-base.json",
   "overlay" : "overlay",
   "host-init" : "host-init.sh",
-  "rootfs-size" : "16GiB"
+  "rootfs-size" : "16GiB",
+  "spike-args": "--extension=gemmini"
 }
diff --git a/software/gemmini-tests.json b/software/gemmini-tests.json
index fc0e45a9..754e35ed 100644
--- a/software/gemmini-tests.json
+++ b/software/gemmini-tests.json
@@ -5,5 +5,6 @@
   "overlay" : "overlay",
   "host-init" : "host-init.sh",
   "command": "/root/run-tests.sh",
-  "rootfs-size" : "16GiB"
+  "rootfs-size" : "16GiB",
+  "spike-args" : "--extension=gemmini"
 }
diff --git a/software/overlay/root/run-test-smoke.sh b/software/overlay/root/run-test-smoke.sh
new file mode 100755
index 00000000..7ba04632
--- /dev/null
+++ b/software/overlay/root/run-test-smoke.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+echo "*****************TEST RESULTS*************" > test_output.txt
+
+echo "========mobilenet========="
+/root/imagenet/mobilenet-linux >> test_output.txt
+
+cat test_output.txt
+poweroff -f

From e1f7eaa9292a02bb8e6699fc545c0400f065ebee Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Mon, 20 Mar 2023 19:14:32 -0700
Subject: [PATCH 56/64] Bump to latest rocket-chip/chisel3.5.6

---
 build.sbt                                             | 4 ++--
 src/main/scala/gemmini/CmdFSM.scala                   | 2 +-
 src/main/scala/gemmini/Configs.scala                  | 2 +-
 src/main/scala/gemmini/ConfigsFP.scala                | 2 +-
 src/main/scala/gemmini/Controller.scala               | 2 +-
 src/main/scala/gemmini/CounterFile.scala              | 2 +-
 src/main/scala/gemmini/CustomConfigs.scala            | 2 +-
 src/main/scala/gemmini/CustomSoCConfigs.scala         | 2 +-
 src/main/scala/gemmini/DMA.scala                      | 2 +-
 src/main/scala/gemmini/DSEConfigs.scala               | 2 +-
 src/main/scala/gemmini/ExecuteController.scala        | 2 +-
 src/main/scala/gemmini/FrontendTLB.scala              | 2 +-
 src/main/scala/gemmini/InstructionCompression.scala   | 2 +-
 src/main/scala/gemmini/LoadController.scala           | 2 +-
 src/main/scala/gemmini/LoopConv.scala                 | 2 +-
 src/main/scala/gemmini/LoopMatmul.scala               | 2 +-
 src/main/scala/gemmini/LoopUnroller.scala             | 2 +-
 src/main/scala/gemmini/Scratchpad.scala               | 2 +-
 src/main/scala/gemmini/StoreController.scala          | 2 +-
 src/main/scala/gemmini/TilerController.scala          | 2 +-
 src/main/scala/gemmini/TilerFSM.scala                 | 2 +-
 src/main/scala/gemmini/TilerScheduler.scala           | 2 +-
 src/main/scala/gemmini/TransposePreloadUnroller.scala | 2 +-
 23 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/build.sbt b/build.sbt
index de8bd4e5..80242277 100644
--- a/build.sbt
+++ b/build.sbt
@@ -7,9 +7,9 @@ version := "3.1.0"
 scalaVersion := "2.13.10"
 
 libraryDependencies ++= Seq(
-  "edu.berkeley.cs" %% "chisel3" % "3.4.+",
+  "edu.berkeley.cs" %% "chisel3" % "3.5.6",
   "edu.berkeley.cs" %% "rocketchip" % "1.2.+",
-  "edu.berkeley.cs" %% "chisel-iotesters" % "1.5.+",
+  "edu.berkeley.cs" %% "chisel-iotesters" % "2.5.6",
   "org.scalanlp" %% "breeze" % "1.1")
 
 resolvers ++= Seq(
diff --git a/src/main/scala/gemmini/CmdFSM.scala b/src/main/scala/gemmini/CmdFSM.scala
index 8ee3a696..e5fdb832 100644
--- a/src/main/scala/gemmini/CmdFSM.scala
+++ b/src/main/scala/gemmini/CmdFSM.scala
@@ -9,7 +9,7 @@ package gemmini
 import chisel3._
 import chisel3.util._
 import chisel3.experimental._
-import freechips.rocketchip.config._
+import org.chipsalliance.cde.config._
 import freechips.rocketchip.rocket._
 import freechips.rocketchip.tile._
 import GemminiISA._
diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala
index b0d73764..a849883d 100644
--- a/src/main/scala/gemmini/Configs.scala
+++ b/src/main/scala/gemmini/Configs.scala
@@ -1,7 +1,7 @@
 package gemmini
 
 import chisel3._
-import freechips.rocketchip.config.{Config, Parameters}
+import org.chipsalliance.cde.config.{Config, Parameters}
 import freechips.rocketchip.diplomacy.LazyModule
 import freechips.rocketchip.subsystem._
 import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet, XLen}
diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala
index c76907dd..ceb9d2a0 100644
--- a/src/main/scala/gemmini/ConfigsFP.scala
+++ b/src/main/scala/gemmini/ConfigsFP.scala
@@ -1,7 +1,7 @@
 package gemmini
 
 import chisel3._
-import freechips.rocketchip.config.{Config, Parameters}
+import org.chipsalliance.cde.config.{Config, Parameters}
 import freechips.rocketchip.diplomacy.{LazyModule, ValName}
 import freechips.rocketchip.subsystem._
 import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet}
diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
index d4c5f7d5..0fdda55f 100644
--- a/src/main/scala/gemmini/Controller.scala
+++ b/src/main/scala/gemmini/Controller.scala
@@ -6,7 +6,7 @@ import java.nio.file.{Files, Paths}
 
 import chisel3._
 import chisel3.util._
-import freechips.rocketchip.config._
+import org.chipsalliance.cde.config._
 import freechips.rocketchip.diplomacy._
 import freechips.rocketchip.tile._
 import freechips.rocketchip.util.ClockGate
diff --git a/src/main/scala/gemmini/CounterFile.scala b/src/main/scala/gemmini/CounterFile.scala
index 7b28b8e2..055ab1f3 100644
--- a/src/main/scala/gemmini/CounterFile.scala
+++ b/src/main/scala/gemmini/CounterFile.scala
@@ -2,7 +2,7 @@ package gemmini
 
 import chisel3._
 import chisel3.util._
-import freechips.rocketchip.config._
+import org.chipsalliance.cde.config._
 import freechips.rocketchip.diplomacy._
 import freechips.rocketchip.tile._
 import freechips.rocketchip.tilelink.{TLIdentityNode}
diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala
index ae529a69..011d7ce1 100644
--- a/src/main/scala/gemmini/CustomConfigs.scala
+++ b/src/main/scala/gemmini/CustomConfigs.scala
@@ -1,6 +1,6 @@
 package gemmini
 
-import chipsalliance.rocketchip.config.{Config, Parameters}
+import org.chipsalliance.cde.config.{Config, Parameters}
 import chisel3._
 import freechips.rocketchip.diplomacy.LazyModule
 import freechips.rocketchip.subsystem.SystemBusKey
diff --git a/src/main/scala/gemmini/CustomSoCConfigs.scala b/src/main/scala/gemmini/CustomSoCConfigs.scala
index 057aa1e1..58636db1 100644
--- a/src/main/scala/gemmini/CustomSoCConfigs.scala
+++ b/src/main/scala/gemmini/CustomSoCConfigs.scala
@@ -1,7 +1,7 @@
 /*
 package chipyard
 
-import freechips.rocketchip.config.{Config}
+import org.chipsalliance.cde.config.{Config}
 
 class CustomGemminiSoCConfig extends Config(
   new gemmini.GemminiCustomConfig ++
diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala
index 6d87a01e..1fd0be82 100644
--- a/src/main/scala/gemmini/DMA.scala
+++ b/src/main/scala/gemmini/DMA.scala
@@ -5,7 +5,7 @@ import chisel3._
 import chisel3.util._
 import chisel3.experimental.DataMirror
 
-import freechips.rocketchip.config.Parameters
+import org.chipsalliance.cde.config.Parameters
 import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp}
 import freechips.rocketchip.tile.{CoreBundle, HasCoreParameters}
 import freechips.rocketchip.tilelink._
diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala
index 257721ca..26b8c3aa 100644
--- a/src/main/scala/gemmini/DSEConfigs.scala
+++ b/src/main/scala/gemmini/DSEConfigs.scala
@@ -2,7 +2,7 @@
 package gemmini
 
 import chisel3._
-import freechips.rocketchip.config.{Config, Parameters}
+import org.chipsalliance.cde.config.{Config, Parameters}
 import freechips.rocketchip.diplomacy.{LazyModule, ValName}
 import freechips.rocketchip.subsystem._
 import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet}
diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala
index 65add720..13f45eea 100644
--- a/src/main/scala/gemmini/ExecuteController.scala
+++ b/src/main/scala/gemmini/ExecuteController.scala
@@ -5,7 +5,7 @@ import chisel3._
 import chisel3.util._
 import GemminiISA._
 import Util._
-import freechips.rocketchip.config.Parameters
+import org.chipsalliance.cde.config.Parameters
 import midas.targetutils.PerfCounter
 
 // TODO do we still need to flush when the dataflow is weight stationary? Won't the result just keep travelling through on its own?
diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala
index 63a41496..4a2e3d21 100644
--- a/src/main/scala/gemmini/FrontendTLB.scala
+++ b/src/main/scala/gemmini/FrontendTLB.scala
@@ -3,7 +3,7 @@ package gemmini
 import chisel3._
 import chisel3.util._
 
-import freechips.rocketchip.config.Parameters
+import org.chipsalliance.cde.config.Parameters
 import freechips.rocketchip.rocket._
 import freechips.rocketchip.tile.{CoreBundle, CoreModule}
 import freechips.rocketchip.tilelink.TLEdgeOut
diff --git a/src/main/scala/gemmini/InstructionCompression.scala b/src/main/scala/gemmini/InstructionCompression.scala
index fe6cd3d9..64ab15dc 100644
--- a/src/main/scala/gemmini/InstructionCompression.scala
+++ b/src/main/scala/gemmini/InstructionCompression.scala
@@ -4,7 +4,7 @@ import chisel3._
 import chisel3.util._
 
 import freechips.rocketchip.tile.RoCCCommand
-import freechips.rocketchip.config.Parameters
+import org.chipsalliance.cde.config.Parameters
 
 import GemminiISA._
 import Util._
diff --git a/src/main/scala/gemmini/LoadController.scala b/src/main/scala/gemmini/LoadController.scala
index 71ecf7c7..4e5132f7 100644
--- a/src/main/scala/gemmini/LoadController.scala
+++ b/src/main/scala/gemmini/LoadController.scala
@@ -5,7 +5,7 @@ import chisel3._
 import chisel3.util._
 import GemminiISA._
 import Util._
-import freechips.rocketchip.config.Parameters
+import org.chipsalliance.cde.config.Parameters
 import midas.targetutils.PerfCounter
 
 // TODO we need to check for WAW errors here
diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index 53032a51..db6866a9 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -5,7 +5,7 @@ import chisel3._
 import chisel3.util._
 import chisel3.experimental._
 import freechips.rocketchip.tile.RoCCCommand
-import freechips.rocketchip.config.Parameters
+import org.chipsalliance.cde.config.Parameters
 import GemminiISA._
 import LocalAddr._
 import Util._
diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala
index a33155e9..c9e6fed3 100644
--- a/src/main/scala/gemmini/LoopMatmul.scala
+++ b/src/main/scala/gemmini/LoopMatmul.scala
@@ -5,7 +5,7 @@ import chisel3._
 import chisel3.util._
 import chisel3.experimental._
 import freechips.rocketchip.tile.RoCCCommand
-import freechips.rocketchip.config.Parameters
+import org.chipsalliance.cde.config.Parameters
 import GemminiISA._
 import LocalAddr._
 import Util._
diff --git a/src/main/scala/gemmini/LoopUnroller.scala b/src/main/scala/gemmini/LoopUnroller.scala
index 63a0150b..75ad7d3a 100644
--- a/src/main/scala/gemmini/LoopUnroller.scala
+++ b/src/main/scala/gemmini/LoopUnroller.scala
@@ -5,7 +5,7 @@ import chisel3.util._
 import chisel3.experimental._
 
 import freechips.rocketchip.tile.RoCCCommand
-import freechips.rocketchip.config.Parameters
+import org.chipsalliance.cde.config.Parameters
 
 import GemminiISA._
 import Util._
diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala
index f6e40187..ceb8b0ae 100644
--- a/src/main/scala/gemmini/Scratchpad.scala
+++ b/src/main/scala/gemmini/Scratchpad.scala
@@ -3,7 +3,7 @@ package gemmini
 
 import chisel3._
 import chisel3.util._
-import freechips.rocketchip.config.Parameters
+import org.chipsalliance.cde.config.Parameters
 import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp}
 import freechips.rocketchip.rocket._
 import freechips.rocketchip.tile._
diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala
index bf4a71b2..72cd761b 100644
--- a/src/main/scala/gemmini/StoreController.scala
+++ b/src/main/scala/gemmini/StoreController.scala
@@ -6,7 +6,7 @@ import chisel3.util._
 import chisel3.experimental._
 import GemminiISA._
 import Util._
-import freechips.rocketchip.config.Parameters
+import org.chipsalliance.cde.config.Parameters
 import midas.targetutils.PerfCounter
 
 // TODO this is almost a complete copy of LoadController. We should combine them into one class
diff --git a/src/main/scala/gemmini/TilerController.scala b/src/main/scala/gemmini/TilerController.scala
index f3275790..45930007 100644
--- a/src/main/scala/gemmini/TilerController.scala
+++ b/src/main/scala/gemmini/TilerController.scala
@@ -6,7 +6,7 @@ package gemmini
 import chisel3._
 import chisel3.util._
 import chisel3.experimental._
-import freechips.rocketchip.config._
+import org.chipsalliance.cde.config._
 import freechips.rocketchip.rocket._
 import freechips.rocketchip.tile._
 
diff --git a/src/main/scala/gemmini/TilerFSM.scala b/src/main/scala/gemmini/TilerFSM.scala
index db400f96..010bed9a 100644
--- a/src/main/scala/gemmini/TilerFSM.scala
+++ b/src/main/scala/gemmini/TilerFSM.scala
@@ -6,7 +6,7 @@ package gemmini
 import chisel3._
 import chisel3.util._
 import chisel3.experimental._
-import freechips.rocketchip.config._
+import org.chipsalliance.cde.config._
 import freechips.rocketchip.tile._
 import GemminiISA._
 import Util.regwire
diff --git a/src/main/scala/gemmini/TilerScheduler.scala b/src/main/scala/gemmini/TilerScheduler.scala
index d9fd8b6a..0b273a5a 100644
--- a/src/main/scala/gemmini/TilerScheduler.scala
+++ b/src/main/scala/gemmini/TilerScheduler.scala
@@ -5,7 +5,7 @@ package gemmini
 
 import chisel3._
 import chisel3.util._
-import freechips.rocketchip.config._
+import org.chipsalliance.cde.config._
 import freechips.rocketchip.tile._
 import GemminiISA._
 import Util._
diff --git a/src/main/scala/gemmini/TransposePreloadUnroller.scala b/src/main/scala/gemmini/TransposePreloadUnroller.scala
index 18d7320d..68407344 100644
--- a/src/main/scala/gemmini/TransposePreloadUnroller.scala
+++ b/src/main/scala/gemmini/TransposePreloadUnroller.scala
@@ -3,7 +3,7 @@ package gemmini
 import chisel3._
 import chisel3.util._
 import chisel3.experimental.ChiselEnum
-import chipsalliance.rocketchip.config.Parameters
+import org.chipsalliance.cde.config.Parameters
 import Util._
 import midas.targetutils.PerfCounter
 

From b51f7a1af48f2c9ff91bcce8b4adc7f2336dff92 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Mon, 20 Mar 2023 23:48:01 -0700
Subject: [PATCH 57/64] Bump chipyard hash

---
 CHIPYARD.hash | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHIPYARD.hash b/CHIPYARD.hash
index 747b6b18..b51f4be7 100644
--- a/CHIPYARD.hash
+++ b/CHIPYARD.hash
@@ -1 +1 @@
-bcbe3b7f1f40d1c388aca68df498fd7dd4d16e89
+52671ba069ae17c68a611b1e0bffc692d8cb2f4a

From 965ea0b3c5ffd7b68e6bf5a50ea9ff0750a5b988 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Fri, 31 Mar 2023 17:26:07 -0700
Subject: [PATCH 58/64] Fix LoopConv variable naming

---
 src/main/scala/gemmini/LoopConv.scala | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index db6866a9..bc87ae10 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -661,12 +661,12 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
 
   // val new_weights = b === 0.U && orow === 0.U && ocol === 0.U
   val new_weights = Reg(Bool())
-  val krow_ = Mux(req.wrot180, krows - krow - 1.U, krow)
-  val kcol_ = Mux(req.wrot180, kcols - kcol - 1.U, kcol)
+  val krow_rot = Mux(req.wrot180, krows - krow - 1.U, krow)
+  val kcol_rot = Mux(req.wrot180, kcols - kcol - 1.U, kcol)
 
   val b_addr = Mux(req.trans_weight_0132,
-    b_addr_start +& (kch / block_size.U(och.getWidth.W)) * krows * kcols * ochs +& krow_ * kcols * ochs +& kcol_ * ochs +& och,
-    b_addr_start +& (och / block_size.U(och.getWidth.W)) * krows * kcols * kchs +& krow_ * kcols * kchs +& kcol_ * kchs +& kch)
+    b_addr_start +& (kch / block_size.U(och.getWidth.W)) * krows * kcols * ochs +& krow_rot * kcols * ochs +& kcol_rot * ochs +& och,
+    b_addr_start +& (och / block_size.U(och.getWidth.W)) * krows * kcols * kchs +& krow_rot * kcols * kchs +& kcol_rot * kchs +& kch)
 
   class RoCCCommandWithAddr extends Bundle {
     val cmd = new RoCCCommand

From 934058a2e37b38682fc38db13c75fafcbb20a619 Mon Sep 17 00:00:00 2001
From: Jerry Zhao <jerryz123@berkeley.edu>
Date: Tue, 4 Apr 2023 11:35:45 -0700
Subject: [PATCH 59/64] Bump Chipyard

---
 CHIPYARD.hash | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHIPYARD.hash b/CHIPYARD.hash
index b51f4be7..ed413fdd 100644
--- a/CHIPYARD.hash
+++ b/CHIPYARD.hash
@@ -1 +1 @@
-52671ba069ae17c68a611b1e0bffc692d8cb2f4a
+569917e2f30616f85a841d16a92914ae98ad7184

From 9debd98b03b2835538ea47acdf391d73d816ee67 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Sun, 7 May 2023 19:58:40 -0700
Subject: [PATCH 60/64] fix simulator args in run-midas.sh

---
 scripts/run-midas.sh | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/scripts/run-midas.sh b/scripts/run-midas.sh
index 63616809..8de7a0fc 100755
--- a/scripts/run-midas.sh
+++ b/scripts/run-midas.sh
@@ -120,7 +120,17 @@ if [ ! -f ./${simulator}${DEBUG} ]; then
   echo 'Did you run `./scripts/build-midas.sh`?'
 fi
 
+sim_args="+vcs+initreg+0 +vcs+initmem+0 +fesvr-step-size=128 +mm_relaxFunctionalModel_0=0 +mm_openPagePolicy_0=1 +mm_backendLatency_0=2 +mm_dramTimings_tAL_0=0 +mm_dramTimings_tCAS_0=14 +mm_dramTimings_tCMD_0=1 +mm_dramTimings_tCWD_0=10 +mm_dramTimings_tCCD_0=4 +mm_dramTimings_tFAW_0=25 +mm_dramTimings_tRAS_0=33 +mm_dramTimings_tREFI_0=7800 +mm_dramTimings_tRC_0=47 +mm_dramTimings_tRCD_0=14 +mm_dramTimings_tRFC_0=160 +mm_dramTimings_tRRD_0=8 +mm_dramTimings_tRP_0=14 +mm_dramTimings_tRTP_0=8 +mm_dramTimings_tRTRS_0=2 +mm_dramTimings_tWR_0=15 +mm_dramTimings_tWTR_0=8 +mm_rowAddr_offset_0=18 +mm_rowAddr_mask_0=65535 +mm_rankAddr_offset_0=16 +mm_rankAddr_mask_0=3 +mm_bankAddr_offset_0=13 +mm_bankAddr_mask_0=7 +shmemportname0=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +macaddr0=00:00:00:00:00:02 +niclog0=niclog0 +linklatency0=6405 +netbw0=100 +netburst0=8 +nic-loopback0 +tracefile=TRACEFILE +blkdev-in-mem0=128 +blkdev-log0=blkdev-log0 +autocounter-readrate=1000 +autocounter-filename=AUTOCOUNTERFILE +max-cycles=100000000 +dramsim +dramsim_ini_dir=/home/eecs/hngenc/chip/generators/testchipip/src/main/resources/dramsim2_ini"
+
+if [ $dram_model == "DDR3FRFCFS" ] || [ $dram_model == "DDR3FRFCFSLLC4MB" ]; then
+    sim_args="$sim_args +mm_schedulerWindowSize_0=8 +mm_transactionQueueDepth_0=8"
+fi
+
+if [ $dram_model == "DDR3FRFCFSLLC4MB" ]; then
+    sim_args="$sim_args +mm_llc_wayBits_0=3 +mm_llc_setBits_0=12 +mm_llc_blockBits_0=7 +mm_llc_activeMSHRs_0=8"
+fi
+
 ./${simulator}${DEBUG} ${PK} ${full_binary_path} ${waveform_flag} \
-    +vcs+initreg+0 +vcs+initmem+0 +fesvr-step-size=128 +mm_relaxFunctionalModel_0=0 +mm_openPagePolicy_0=1 +mm_backendLatency_0=2 +mm_schedulerWindowSize_0=8 +mm_transactionQueueDepth_0=8 +mm_dramTimings_tAL_0=0 +mm_dramTimings_tCAS_0=14 +mm_dramTimings_tCMD_0=1 +mm_dramTimings_tCWD_0=10 +mm_dramTimings_tCCD_0=4 +mm_dramTimings_tFAW_0=25 +mm_dramTimings_tRAS_0=33 +mm_dramTimings_tREFI_0=7800 +mm_dramTimings_tRC_0=47 +mm_dramTimings_tRCD_0=14 +mm_dramTimings_tRFC_0=160 +mm_dramTimings_tRRD_0=8 +mm_dramTimings_tRP_0=14 +mm_dramTimings_tRTP_0=8 +mm_dramTimings_tRTRS_0=2 +mm_dramTimings_tWR_0=15 +mm_dramTimings_tWTR_0=8 +mm_rowAddr_offset_0=18 +mm_rowAddr_mask_0=65535 +mm_rankAddr_offset_0=16 +mm_rankAddr_mask_0=3 +mm_bankAddr_offset_0=13 +mm_bankAddr_mask_0=7 +mm_llc_wayBits_0=3 +mm_llc_setBits_0=12 +mm_llc_blockBits_0=7 +mm_llc_activeMSHRs_0=8 +shmemportname0=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +macaddr0=00:00:00:00:00:02 +niclog0=niclog0 +linklatency0=6405 +netbw0=100 +netburst0=8 +nic-loopback0 +tracefile=TRACEFILE +blkdev-in-mem0=128 +blkdev-log0=blkdev-log0 +autocounter-readrate=1000 +autocounter-filename=AUTOCOUNTERFILE +max-cycles=100000000 \
-    +dramsim +dramsim_ini_dir=/home/eecs/hngenc/chip/generators/testchipip/src/main/resources/dramsim2_ini \
+    $sim_args \
     2>/dev/null
+

From 7c16dcf349e84c23f2af5176043185e3046b909b Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Tue, 9 May 2023 22:10:41 -0700
Subject: [PATCH 61/64] set system-bus-width to match Gemmini's dma-bus-width

---
 README.md                                     | 2 +-
 src/main/scala/gemmini/CustomSoCConfigs.scala | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 4caf35bb..1a1db0ce 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ Run these steps to install Chipyard and Spike (make sure to checkout the correct
 ```shell
 git clone https://github.com/ucb-bar/chipyard.git
 cd chipyard
-git checkout 1.8.1
+git checkout 1.9.1
 ./build-setup.sh riscv-tools
 
 source env.sh
diff --git a/src/main/scala/gemmini/CustomSoCConfigs.scala b/src/main/scala/gemmini/CustomSoCConfigs.scala
index 58636db1..cdf2fd54 100644
--- a/src/main/scala/gemmini/CustomSoCConfigs.scala
+++ b/src/main/scala/gemmini/CustomSoCConfigs.scala
@@ -1,7 +1,8 @@
 /*
 package chipyard
 
-import org.chipsalliance.cde.config.{Config}
+import org.chipsalliance.cde.config.Config
+import gemmini.{GemminiCustomConfig, GemminiCustomConfigs}
 
 class CustomGemminiSoCConfig extends Config(
   new gemmini.GemminiCustomConfig ++
@@ -19,6 +20,7 @@ class CustomGemminiSoCConfig extends Config(
   // Set the number of CPUs you want to create
   new chipyard.CustomGemmminiCPUConfigs.CustomCPU(1) ++
 
+  new chipyard.config.WithSystemBusWidth(GemminiCustomConfigs.customConfig.dma_buswidth) ++
   new chipyard.config.AbstractConfig
 )
 */

From 4070e2c9dcb16d24445ceabec9e5b4a7215e5ba7 Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Tue, 9 May 2023 22:53:31 -0700
Subject: [PATCH 62/64] update readme to include scratchpad-memory stride in
 config-mvin

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1a1db0ce..e05dadb1 100644
--- a/README.md
+++ b/README.md
@@ -496,8 +496,9 @@ This limitation may be lifted in the future.
 - `rs1[1:0]` must be `01`
 - `rs1[2]` is 0 if `mvin`s to the accumulator are of type `accType`, and 1 if they are `inputType`
 - `rs1[4:3]` is 0 if the stride is being set for `mvin`, 1 if the stride is being set for `mvin2`, and 2 if the stride is being set for `mvin3`
+- `rs1[31:16]` is the scratchpad-memory stride (also called the "private-memory stride" above)
 - `rs1[63:32]` is the "scale" by which to multiply data as it's being moved in to the scratchpad. This is ignored if Gemmini isn't configured to have the ability to scale values during `mvin`s.
-- `rs2` = the stride in bytes
+- `rs2` is the main-memory stride in bytes
 - `funct` = 0
 
 **Action:** stride <= rs2; scale <= rs1[63:32]

From d48db378f5f7dabeb677cfc0536aecb7711e37ad Mon Sep 17 00:00:00 2001
From: Charles Hong <31425218+charleshong3@users.noreply.github.com>
Date: Wed, 10 May 2023 10:23:29 -0700
Subject: [PATCH 63/64] add timeloop arch and mapspace YAMLs (#279)

---
 modeling/timeloop/arch/arch_default.yaml | 64 ++++++++++++++++++++++++
 modeling/timeloop/mapspace/mapspace.yaml | 50 ++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 modeling/timeloop/arch/arch_default.yaml
 create mode 100644 modeling/timeloop/mapspace/mapspace.yaml

diff --git a/modeling/timeloop/arch/arch_default.yaml b/modeling/timeloop/arch/arch_default.yaml
new file mode 100644
index 00000000..84c7ca1c
--- /dev/null
+++ b/modeling/timeloop/arch/arch_default.yaml
@@ -0,0 +1,64 @@
+# A Timeloop-compatible YAML definition of an architecture similar to Gemmini
+# WARNING: This model has not been correlated (validated) against the Gemmini RTL
+architecture:
+  version: 0.3
+
+  subtree:
+  - name: System
+    local:
+    - name: DRAM
+      class: DRAM
+      attributes:
+        instances: 1
+        word-bits: 8
+        block_size: 64
+        shared_bandwidth: 8
+
+    subtree:
+    - name: Chip
+      attributes:
+        technology: 40nm
+      local:
+      - name: Scratchpad
+        class: SRAM
+        attributes:
+          entries: 262144
+          depth: 16384
+          width: 128
+          instances: 1
+          meshX: 1
+          word-bits: 8
+          n_rdwr_ports: 2
+          n_banks: 4
+      subtree:
+      - name: PECols[0..15] # only the K dim can be parallelized across Accumulator columns
+        local:
+        - name: Accumulator
+          class: SRAM
+          attributes:
+            entries: 1024 # acc size / pe_dim = 16384/16
+            depth: 1024
+            width: 32
+            instances: 16
+            word-bits: 32
+            network-word-bits: 16
+            n_rdwr_ports: 2
+            n_banks: 2
+        subtree:
+        - name: PERows[0..15]
+          local:
+          - name: Registers
+            class: SRAM
+            attributes:
+              depth: 1
+              width: 8
+              entries: 1
+              instances: 256
+              word-bits: 8
+              n_rdwr_ports: 2
+              n_banks: 1
+          - name: MACC
+            class: intmac
+            attributes:
+              datawidth: 8
+              word-bits: 8
diff --git a/modeling/timeloop/mapspace/mapspace.yaml b/modeling/timeloop/mapspace/mapspace.yaml
new file mode 100644
index 00000000..a4cf3327
--- /dev/null
+++ b/modeling/timeloop/mapspace/mapspace.yaml
@@ -0,0 +1,50 @@
+mapspace_constraints:
+  - target: Registers
+    type: bypass
+    bypass:
+    - Outputs
+    - Inputs
+    keep:
+    - Weights
+  - target: Accumulator
+    type: bypass
+    bypass:
+    - Weights
+    - Inputs
+    keep:
+    - Outputs
+  - target: Scratchpad
+    type: bypass
+    keep:
+    - Inputs
+    - Weights
+    bypass:
+    - Outputs
+  - target: DRAM
+    type: bypass
+    keep:
+    - Weights
+    - Inputs
+    - Outputs
+    bypass: []
+  - target: Registers
+    type: temporal
+    factors: R=1 S=1 P=1 Q=1 C=1 K=1 N=1
+    permutation: PQRSCKN
+  - target: Accumulator
+    type: spatial
+    factors: R=1 S=1 P=1 Q=1 C<=16 K=1 N=1
+    permutation: QKC
+  - target: Accumulator
+    type: temporal
+    permutation: QPNCSRK
+  - target: Scratchpad
+    type: spatial
+    factors: R=1 S=1 P=1 Q=1 N=1 C=1 K<=16
+  - target: Scratchpad
+    type: temporal
+    factors: R=1 S=1 P=1 Q=1 C=1 K=1 N=1
+  - target: DRAM
+    type: temporal
+    permutation: CSRKQPN
+

From 2efd84adc4d1f320187845060375507f51db983b Mon Sep 17 00:00:00 2001
From: Hasan Genc <hngenc@berkeley.edu>
Date: Mon, 22 May 2023 13:20:42 -0700
Subject: [PATCH 64/64] Update checkout instruction in README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e05dadb1..7d6eb790 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ source env.sh
 
 cd generators/gemmini
 git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
-git checkout dev && git pull origin dev
+git fetch && git checkout v0.7.1
 git submodule update --init --recursive
 
 make -C software/libgemmini install