diff --git a/CHIPYARD.hash b/CHIPYARD.hash
index cba05d74..f41949c3 100644
--- a/CHIPYARD.hash
+++ b/CHIPYARD.hash
@@ -1 +1 @@
-ec1b075658fb92a624151536dd1de76bad94f51f
+117624d8eea27bafd613eec09e9b9b3e31239e08
diff --git a/README.md b/README.md
index 515264b8..5a595622 100644
--- a/README.md
+++ b/README.md
@@ -32,19 +32,19 @@ Run these steps to install Chipyard and Spike (make sure to checkout the correct
 ```shell
 git clone https://github.com/ucb-bar/chipyard.git
 cd chipyard
-git checkout ec1b075658fb92a624151536dd1de76bad94f51f
+git checkout 117624d8eea27bafd613eec09e9b9b3e31239e08
 ./scripts/init-submodules-no-riscv-tools.sh
 ./scripts/build-toolchains.sh esp-tools
 
 source env.sh
 
 cd generators/gemmini
-git fetch && git checkout v0.6.2
+git fetch && git checkout v0.6.3
 git submodule update
 
 cd -
 cd toolchains/esp-tools/riscv-isa-sim/build
-git fetch && git checkout 79486d67f99fa739c8c1d5916c9b74d0417b53c4
+git fetch && git checkout 090e82c473fd28b4eb2011ffcd771ead6076faab
 make && make install
 ```
 
diff --git a/SPIKE.hash b/SPIKE.hash
index d0b80362..f08ac921 100644
--- a/SPIKE.hash
+++ b/SPIKE.hash
@@ -1 +1 @@
-79486d67f99fa739c8c1d5916c9b74d0417b53c4
+090e82c473fd28b4eb2011ffcd771ead6076faab
diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index 96165453..e326e7c4 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit 96165453758f047432a94431d4779f9f331b5e5a
+Subproject commit e326e7c43457ff08669fe88edcaa395d846474d8
diff --git a/src/main/scala/gemmini/AccumulatorMem.scala b/src/main/scala/gemmini/AccumulatorMem.scala
index 8f3fbaf5..18b53624 100644
--- a/src/main/scala/gemmini/AccumulatorMem.scala
+++ b/src/main/scala/gemmini/AccumulatorMem.scala
@@ -14,7 +14,6 @@ class AccumulatorReadReq[T <: Data](n: Int, shift_width: Int, scale_t: T) extend
 
   val fromDMA = Bool()
 
-  override def cloneType: this.type = new AccumulatorReadReq(n, shift_width, scale_t.cloneType).asInstanceOf[this.type]
 }
 
 class AccumulatorReadResp[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U, shift_width: Int) extends Bundle {
@@ -24,14 +23,12 @@ class AccumulatorReadResp[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Ve
   val relu6_shift = UInt(shift_width.W)
   val act = UInt(2.W) // TODO magic number
   val acc_bank_id = UInt(2.W) // TODO don't hardcode
-  override def cloneType: this.type = new AccumulatorReadResp(fullDataType.cloneType, scale_t, shift_width).asInstanceOf[this.type]
 }
 
 class AccumulatorReadIO[T <: Data: Arithmetic, U <: Data](n: Int, shift_width: Int, fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle {
   val req = Decoupled(new AccumulatorReadReq[U](n, shift_width, scale_t))
   val resp = Flipped(Decoupled(new AccumulatorReadResp[T, U](fullDataType, scale_t, shift_width)))
 
-  override def cloneType: this.type = new AccumulatorReadIO(n, shift_width, fullDataType.cloneType, scale_t.cloneType).asInstanceOf[this.type]
 }
 
 class AccumulatorWriteReq[T <: Data: Arithmetic](n: Int, t: Vec[Vec[T]]) extends Bundle {
@@ -41,7 +38,6 @@ class AccumulatorWriteReq[T <: Data: Arithmetic](n: Int, t: Vec[Vec[T]]) extends
   val mask = Vec(t.getWidth / 8, Bool()) // TODO Use aligned_to here
   // val current_waddr = Flipped(Valid(UInt(log2Ceil(n).W))) // This is the raddr that is being fed into the SRAM right now
 
-  override def cloneType: this.type = new AccumulatorWriteReq(n, t).asInstanceOf[this.type]
 }
 
 
@@ -60,7 +56,6 @@ class AccumulatorMemIO [T <: Data: Arithmetic, U <: Data](n: Int, t: Vec[Vec[T]]
     val sum = Input(t.cloneType)
   }
 
-  override def cloneType: this.type = new AccumulatorMemIO(n, t, scale_t, acc_sub_banks, use_shared_ext_mem).asInstanceOf[this.type]
 }
 
 class AccPipe[T <: Data : Arithmetic](latency: Int, t: T)(implicit ev: Arithmetic[T]) extends Module {
@@ -117,7 +112,7 @@ class AccumulatorMem[T <: Data, U <: Data](
 
   val pipelined_writes = Reg(Vec(acc_latency, Valid(new AccumulatorWriteReq(n, t))))
   val oldest_pipelined_write = pipelined_writes(acc_latency-1)
-  pipelined_writes(0).valid := io.write.fire()
+  pipelined_writes(0).valid := io.write.fire
   pipelined_writes(0).bits  := io.write.bits
   for (i <- 1 until acc_latency) {
     pipelined_writes(i) := pipelined_writes(i-1)
@@ -148,8 +143,8 @@ class AccumulatorMem[T <: Data, U <: Data](
     mem.io.mask := oldest_pipelined_write.bits.mask
     rdata_for_adder := mem.io.rdata
     rdata_for_read_resp := mem.io.rdata
-    mem.io.raddr := Mux(io.write.fire() && io.write.bits.acc, io.write.bits.addr, io.read.req.bits.addr)
-    mem.io.ren := io.read.req.fire() || (io.write.fire() && io.write.bits.acc)
+    mem.io.raddr := Mux(io.write.fire && io.write.bits.acc, io.write.bits.addr, io.read.req.bits.addr)
+    mem.io.ren := io.read.req.fire || (io.write.fire && io.write.bits.acc)
   } else {
     val rmw_req = Wire(Decoupled(UInt()))
     rmw_req.valid := io.write.valid && io.write.bits.acc
@@ -203,14 +198,13 @@ class AccumulatorMem[T <: Data, U <: Data](
         val data = Vec(mask_len, mask_elem)
         val mask = Vec(mask_len, Bool())
         val addr = UInt(log2Ceil(n/acc_sub_banks).W)
-        override def cloneType: this.type = new W_Q_Entry(mask_len, mask_elem).asInstanceOf[this.type]
       }
 
       val w_q = Reg(Vec(nEntries, new W_Q_Entry(mask_len, mask_elem)))
       for (e <- w_q) {
         when (e.valid) {
           assert(!(
-            io.write.fire() && io.write.bits.acc &&
+            io.write.fire && io.write.bits.acc &&
             isThisBank(io.write.bits.addr) && getBankIdx(io.write.bits.addr) === e.addr &&
             ((io.write.bits.mask.asUInt & e.mask.asUInt) =/= 0.U)
           ), "you cannot accumulate to an AccumulatorMem address until previous writes to that address have completed")
@@ -276,7 +270,7 @@ class AccumulatorMem[T <: Data, U <: Data](
       //   1. incoming reads for RMW
       //   2. writes from RMW
       //   3. incoming reads
-      when (rmw_req.fire() && isThisBank(rmw_req.bits)) {
+      when (rmw_req.fire && isThisBank(rmw_req.bits)) {
         ren := true.B
         when (isThisBank(only_read_req.bits)) {
           only_read_req.ready := false.B
@@ -287,7 +281,7 @@ class AccumulatorMem[T <: Data, U <: Data](
           only_read_req.ready := false.B
         }
       } .otherwise {
-        ren := isThisBank(only_read_req.bits) && only_read_req.fire()
+        ren := isThisBank(only_read_req.bits) && only_read_req.fire
         raddr := getBankIdx(only_read_req.bits)
       }
 
@@ -304,7 +298,7 @@ class AccumulatorMem[T <: Data, U <: Data](
   q.io.enq.bits.act := RegNext(io.read.req.bits.act)
   q.io.enq.bits.fromDMA := RegNext(io.read.req.bits.fromDMA)
   q.io.enq.bits.acc_bank_id := DontCare
-  q.io.enq.valid := RegNext(io.read.req.fire())
+  q.io.enq.valid := RegNext(io.read.req.fire)
 
   val p = q.io.deq
 
@@ -317,7 +311,7 @@ class AccumulatorMem[T <: Data, U <: Data](
   io.read.resp.valid := p.valid
   p.ready := io.read.resp.ready
 
-  val q_will_be_empty = (q.io.count +& q.io.enq.fire()) - q.io.deq.fire() === 0.U
+  val q_will_be_empty = (q.io.count +& q.io.enq.fire) - q.io.deq.fire === 0.U
   io.read.req.ready := q_will_be_empty && (
       // Make sure we aren't accumulating, which would take over both ports
       !(io.write.valid && io.write.bits.acc) &&
@@ -333,5 +327,5 @@ class AccumulatorMem[T <: Data, U <: Data](
   }
 
   // assert(!(io.read.req.valid && io.write.en && io.write.acc), "reading and accumulating simultaneously is not supported")
-  assert(!(io.read.req.fire() && io.write.fire() && io.read.req.bits.addr === io.write.bits.addr), "reading from and writing to same address is not supported")
+  assert(!(io.read.req.fire && io.write.fire && io.read.req.bits.addr === io.write.bits.addr), "reading from and writing to same address is not supported")
 }
diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala
index 5e4997f8..2d23af1d 100644
--- a/src/main/scala/gemmini/AccumulatorScale.scala
+++ b/src/main/scala/gemmini/AccumulatorScale.scala
@@ -8,7 +8,6 @@ import Util._
 class AccumulatorReadRespWithFullData[T <: Data: Arithmetic, U <: Data](fullDataType: Vec[Vec[T]], scale_t: U, shift_width: Int) extends Bundle {
   val resp = new AccumulatorReadResp(fullDataType, scale_t, shift_width)
   val full_data = fullDataType.cloneType
-  override def cloneType: this.type = new AccumulatorReadRespWithFullData(fullDataType.cloneType, scale_t, shift_width).asInstanceOf[this.type]
 }
 
 
@@ -17,7 +16,6 @@ class AccumulatorScaleResp[T <: Data: Arithmetic](fullDataType: Vec[Vec[T]], rDa
   val data = rDataType.cloneType
   val acc_bank_id = UInt(2.W)
   val fromDMA = Bool()
-  override def cloneType: this.type = new AccumulatorScaleResp(fullDataType, rDataType).asInstanceOf[this.type]
 }
 
 class AccumulatorScaleIO[T <: Data: Arithmetic, U <: Data](
@@ -26,8 +24,6 @@ class AccumulatorScaleIO[T <: Data: Arithmetic, U <: Data](
 ) extends Bundle {
   val in = Flipped(Decoupled(new AccumulatorReadResp[T,U](fullDataType, scale_t, shift_width)))
   val out = Decoupled(new AccumulatorScaleResp[T](fullDataType, rDataType))
-  override def cloneType: this.type = new AccumulatorScaleIO(fullDataType, scale_t,
-    shift_width, rDataType).asInstanceOf[this.type]
 }
 
 class AccScaleDataWithIndex[T <: Data: Arithmetic, U <: Data](t: T, u: U) extends Bundle {
@@ -40,7 +36,6 @@ class AccScaleDataWithIndex[T <: Data: Arithmetic, U <: Data](t: T, u: U) extend
   val full_data = t.cloneType
   val id = UInt(2.W) // TODO hardcoded
   val index = UInt()
-  override def cloneType: this.type = new AccScaleDataWithIndex(t, u).asInstanceOf[this.type]
 }
 
 class AccScalePipe[T <: Data : Arithmetic, U <: Data](t: T, rDataType: Vec[Vec[T]], scale_func: (T, U) => T, scale_t: U, latency: Int, has_nonlinear_activations: Boolean)(implicit ev: Arithmetic[T]) extends Module {
@@ -123,7 +118,7 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
     val tail_oh = RegInit(1.U(nEntries.W))
     out.valid := Mux1H(head_oh.asBools, (regs zip completed_masks).map({case (r, c) => r.valid && c.reduce(_&&_)}))
     out.bits  := Mux1H(head_oh.asBools, out_regs)
-    when (out.fire()) {
+    when (out.fire) {
       for (i <- 0 until nEntries) {
         when (head_oh(i)) {
           regs(i).valid := false.B
@@ -132,8 +127,8 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
       head_oh := (head_oh << 1) | head_oh(nEntries-1)
     }
 
-    io.in.ready := !Mux1H(tail_oh.asBools, regs.map(_.valid)) || (tail_oh === head_oh && out.fire())
-    when (io.in.fire()) {
+    io.in.ready := !Mux1H(tail_oh.asBools, regs.map(_.valid)) || (tail_oh === head_oh && out.fire)
+    when (io.in.fire) {
       for (i <- 0 until nEntries) {
         when (tail_oh(i)) {
           regs(i).valid := true.B
@@ -160,7 +155,7 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
         input.bits.relu6_shift := regs(i).bits.relu6_shift
         input.bits.id := i.U
         input.bits.index := w.U
-        when (input.fire()) {
+        when (input.fire) {
           fired_masks(i)(w) := true.B
         }
       }
@@ -185,7 +180,7 @@ class AccumulatorScale[T <: Data: Arithmetic, U <: Data](
           if ((j*width+w) % num_scale_units == i) {
             val id0 = w % io.in.bits.data(0).size
             val id1 = w / io.in.bits.data(0).size
-            when (pipe_out.fire() && pipe_out.bits.id === j.U && pipe_out.bits.index === w.U) {
+            when (pipe_out.fire && pipe_out.bits.id === j.U && pipe_out.bits.index === w.U) {
               out_regs(j).data     (id1)(id0) := pipe_out.bits.data
               out_regs(j).full_data(id1)(id0) := pipe_out.bits.full_data
               completed_masks(j)(w) := true.B
diff --git a/src/main/scala/gemmini/BeatMerger.scala b/src/main/scala/gemmini/BeatMerger.scala
index ac7b284f..a845327b 100644
--- a/src/main/scala/gemmini/BeatMerger.scala
+++ b/src/main/scala/gemmini/BeatMerger.scala
@@ -61,7 +61,7 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid
 
   io.req.ready := !req.valid
 
-  io.in.ready := io.req.fire() || (req.valid && bytesRead =/= (1.U << req.bits.lg_len_req).asUInt())
+  io.in.ready := io.req.fire || (req.valid && bytesRead =/= (1.U << req.bits.lg_len_req).asUInt())
 
   io.out.valid := req.valid && usefulBytesRead > bytesSent && (usefulBytesRead - bytesSent >= rowBytes ||
     usefulBytesRead === req.bits.bytes_to_read)
@@ -92,7 +92,7 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid
     req.pop()
   }
 
-  when (io.out.fire()) {
+  when (io.out.fire) {
     bytesSent := bytesSent_next
 
     when (last_sending && bytesRead === (1.U << req.bits.lg_len_req).asUInt()) {
@@ -101,18 +101,18 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid
     }
   }
 
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req.push(io.req.bits)
     bytesRead := 0.U
     bytesSent := 0.U
   }
 
-  when (io.in.fire()) {
-    val current_bytesRead = Mux(io.req.fire(), 0.U, bytesRead)
-    val current_bytesDiscarded = Mux(io.req.fire(), 0.U, bytesDiscarded)
-    val current_usefulBytesRead = Mux(io.req.fire(), 0.U, usefulBytesRead)
-    val current_shift = Mux(io.req.fire(), io.req.bits.shift, req.bits.shift)
-    val current_lg_len_req = Mux(io.req.fire(), io.req.bits.lg_len_req, req.bits.lg_len_req)
+  when (io.in.fire) {
+    val current_bytesRead = Mux(io.req.fire, 0.U, bytesRead)
+    val current_bytesDiscarded = Mux(io.req.fire, 0.U, bytesDiscarded)
+    val current_usefulBytesRead = Mux(io.req.fire, 0.U, usefulBytesRead)
+    val current_shift = Mux(io.req.fire, io.req.bits.shift, req.bits.shift)
+    val current_lg_len_req = Mux(io.req.fire, io.req.bits.lg_len_req, req.bits.lg_len_req)
     val current_len_req = (1.U << current_lg_len_req).asUInt()
 
     when (current_shift - current_bytesDiscarded <= beatBytes.U /* &&
@@ -127,7 +127,7 @@ class BeatMerger[U <: Data](beatBits: Int, maxShift: Int, spadWidth: Int, accWid
 
     bytesRead := satAdd(current_bytesRead, beatBytes.U, current_len_req)
 
-    when (!io.req.fire() && bytesSent === req.bits.bytes_to_read && last_reading) {
+    when (!io.req.fire && bytesSent === req.bits.bytes_to_read && last_reading) {
       req.pop()
     }
   }
diff --git a/src/main/scala/gemmini/CmdFSM.scala b/src/main/scala/gemmini/CmdFSM.scala
index 5a3b51f9..8ee3a696 100644
--- a/src/main/scala/gemmini/CmdFSM.scala
+++ b/src/main/scala/gemmini/CmdFSM.scala
@@ -64,7 +64,7 @@ class CmdFSM[T <: Data: Arithmetic, U <: Data, V <: Data]
   status := DontCare
 
   //==========================================================================
-  // Combinational Output Defaults 
+  // Combinational Output Defaults
   //==========================================================================
   io.cmd.ready         := false.B
   io.tiler.valid       := false.B
@@ -90,7 +90,7 @@ class CmdFSM[T <: Data: Arithmetic, U <: Data, V <: Data]
   io.busy := (state === s_EX_PENDING)
 
   //==========================================================================
-  // FSM 
+  // FSM
   //==========================================================================
   def reset_and_listen(): Unit = {
     // Reset all data-validity
@@ -109,13 +109,13 @@ class CmdFSM[T <: Data: Arithmetic, U <: Data, V <: Data]
     // Wait for tiling/ execution to complete,
     // let any further commands queue up
     io.tiler.valid := true.B
-    when (io.tiler.fire()) {
+    when (io.tiler.fire) {
       state := s_LISTENING
     }
   }.elsewhen (state === s_ERROR) {
     // In s_ERROR state - only update based on RESET commands
     io.cmd.ready := true.B
-    when (io.cmd.fire()) {
+    when (io.cmd.fire) {
       val cmd = io.cmd.bits
       val funct = cmd.inst.funct
       when (funct === RESET) {
@@ -124,7 +124,7 @@ class CmdFSM[T <: Data: Arithmetic, U <: Data, V <: Data]
     }
   }.otherwise { // s_LISTENING State
     io.cmd.ready := true.B
-    when (io.cmd.fire()) {
+    when (io.cmd.fire) {
       val cmd = io.cmd.bits
       val funct = cmd.inst.funct
       val rs1 = cmd.rs1
@@ -143,7 +143,7 @@ class CmdFSM[T <: Data: Arithmetic, U <: Data, V <: Data]
         // Signal to the Tiler, and move to our EXEC state
         // FIXME: check all valid
         io.tiler.valid := true.B
-        when (io.tiler.fire()) {
+        when (io.tiler.fire) {
           state := s_LISTENING
         }.otherwise {
           state := s_EX_PENDING
diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
index 74f23b4c..d1019ef1 100644
--- a/src/main/scala/gemmini/Controller.scala
+++ b/src/main/scala/gemmini/Controller.scala
@@ -18,7 +18,6 @@ class GemminiCmd(rob_entries: Int)(implicit p: Parameters) extends Bundle {
   val cmd = new RoCCCommand
   val rob_id = UDValid(UInt(log2Up(rob_entries).W))
 
-  override def cloneType: this.type = new GemminiCmd(rob_entries).asInstanceOf[this.type]
 }
 
 class Gemmini[T <: Data : Arithmetic, U <: Data, V <: Data](val config: GemminiArrayConfig[T, U, V])
@@ -389,7 +388,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
     .otherwise {
       reservation_station.io.alloc.valid := true.B
 
-      when(reservation_station.io.alloc.fire()) {
+      when(reservation_station.io.alloc.fire) {
         // compressed_cmd.ready := true.B
         unrolled_cmd.ready := true.B
       }
@@ -414,5 +413,5 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
   //=========================================================================
   // Performance Counters Access
   //=========================================================================
-  
+
 }
diff --git a/src/main/scala/gemmini/CounterFile.scala b/src/main/scala/gemmini/CounterFile.scala
index 9f0482f7..35f50c20 100644
--- a/src/main/scala/gemmini/CounterFile.scala
+++ b/src/main/scala/gemmini/CounterFile.scala
@@ -155,7 +155,7 @@ class CounterIO(nPerfCounter: Int, counterWidth: Int) extends Bundle {
 
 // A simple counter file. Every counter is incremented when the corresponding event signal is high on rising edge.
 // There are two type of counters: Built-in counters and external counters. External counters have their value
-// stored in other modules and can incremented by arbitary values. 
+// stored in other modules and can be incremented by arbitrary values.
 class CounterFile(nPerfCounter: Int, counterWidth: Int) extends Module
 {
   val io = IO(new CounterIO(nPerfCounter, counterWidth))
@@ -182,8 +182,8 @@ class CounterFile(nPerfCounter: Int, counterWidth: Int) extends Module
     }
     // Snapshot: In case a sequence of access instructions get interrupted (i.e. preempted by OS), it is possible
     // to take a snapshot when reading counter value by setting a bit in the instruction. All subsequent readings
-    // return the values from the snapshot until it is cleared by a instruction with "clear" bit marked. 
-    // When the snapshot bit is set, the normal counters are still being incremented. 
+    // return the values from the snapshot until it is cleared by an instruction with "clear" bit marked.
+    // When the snapshot bit is set, the normal counters are still being incremented.
     when (io.snapshot_reset) {
       snapshot_enable := false.B
     } .elsewhen (io.snapshot) {
@@ -227,7 +227,7 @@ class CounterController(nPerfCounter: Int, counterWidth: Int)(implicit p: Parame
 
     val module = Module(new CounterFile(nPerfCounter: Int, counterWidth: Int))
     module.io.event_io <> io.event_io
-    
+
     val out_reg = Reg(io.out.bits.cloneType)
     val out_valid_reg = RegInit(false.B)
 
@@ -243,16 +243,16 @@ class CounterController(nPerfCounter: Int, counterWidth: Int)(implicit p: Parame
 
     io.in.ready := !out_valid_reg
     module.io.addr := io.in.bits.rs1(nCounterIndexBit + 3, 4)
-    module.io.counter_reset := io.in.bits.rs1(0) & io.in.fire()
-    module.io.snapshot_reset := io.in.bits.rs1(1) & io.in.fire()
-    module.io.snapshot := io.in.bits.rs1(2) & io.in.fire()
-    module.io.config_address.valid := io.in.bits.rs1(3) & io.in.fire()
+    module.io.counter_reset := io.in.bits.rs1(0) & io.in.fire
+    module.io.snapshot_reset := io.in.bits.rs1(1) & io.in.fire
+    module.io.snapshot := io.in.bits.rs1(2) & io.in.fire
+    module.io.config_address.valid := io.in.bits.rs1(3) & io.in.fire
     module.io.config_address.bits := io.in.bits.rs1(17, 12)
     module.io.external := io.in.bits.rs1(31)
 
-    when (io.out.fire()) {
+    when (io.out.fire) {
       out_valid_reg := false.B
-    } .elsewhen (io.in.fire()) {
+    } .elsewhen (io.in.fire) {
       out_valid_reg := true.B
       out_reg.rd := io.in.bits.inst.rd
       out_reg.data := 0.U
@@ -264,4 +264,4 @@ class CounterController(nPerfCounter: Int, counterWidth: Int)(implicit p: Parame
   } else {
     io <> DontCare
   }
-}
\ No newline at end of file
+}
diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala
index 5952be5b..9761228f 100644
--- a/src/main/scala/gemmini/DMA.scala
+++ b/src/main/scala/gemmini/DMA.scala
@@ -31,7 +31,6 @@ class StreamReadRequest[U <: Data](spad_rows: Int, acc_rows: Int, mvin_scale_t_b
   val block_stride = UInt(16.W) // TODO magic number
   val cmd_id = UInt(8.W) // TODO magic number
 
-  override def cloneType: StreamReadRequest.this.type = new StreamReadRequest(spad_rows, acc_rows, mvin_scale_t_bits).asInstanceOf[this.type]
 }
 
 class StreamReadResponse[U <: Data](spadWidth: Int, accWidth: Int, spad_rows: Int, acc_rows: Int, aligned_to: Int, mvin_scale_t_bits: Int)
@@ -50,7 +49,6 @@ class StreamReadResponse[U <: Data](spadWidth: Int, accWidth: Int, spad_rows: In
   val bytes_read = UInt(8.W) // TODO magic number
   val cmd_id = UInt(8.W) // TODO magic number
 
-  override def cloneType: StreamReadResponse.this.type = new StreamReadResponse(spadWidth, accWidth, spad_rows, acc_rows, aligned_to, mvin_scale_t_bits).asInstanceOf[this.type]
 }
 
 class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], nXacts: Int, beatBits: Int, maxBytes: Int, spadWidth: Int, accWidth: Int, aligned_to: Int,
@@ -83,8 +81,8 @@ class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T
     core.module.io.flush := io.flush
 
     xactTracker.io.alloc <> core.module.io.reserve
-    xactTracker.io.peek.xactid := RegEnableThru(core.module.io.beatData.bits.xactid, beatPacker.io.req.fire())
-    xactTracker.io.peek.pop := beatPacker.io.in.fire() && core.module.io.beatData.bits.last
+    xactTracker.io.peek.xactid := RegEnableThru(core.module.io.beatData.bits.xactid, beatPacker.io.req.fire)
+    xactTracker.io.peek.pop := beatPacker.io.in.fire && core.module.io.beatData.bits.last
 
     core.module.io.beatData.ready := beatPacker.io.in.ready
     beatPacker.io.req.valid := core.module.io.beatData.valid
@@ -101,12 +99,12 @@ class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T
     io.resp.bits.is_acc := beatPacker.io.out.bits.is_acc
     io.resp.bits.accumulate := beatPacker.io.out.bits.accumulate
     io.resp.bits.has_acc_bitwidth := beatPacker.io.out.bits.has_acc_bitwidth
-    io.resp.bits.scale := RegEnable(xactTracker.io.peek.entry.scale, beatPacker.io.req.fire())
-    io.resp.bits.repeats := RegEnable(xactTracker.io.peek.entry.repeats, beatPacker.io.req.fire())
-    io.resp.bits.pixel_repeats := RegEnable(xactTracker.io.peek.entry.pixel_repeats, beatPacker.io.req.fire())
-    io.resp.bits.len := RegEnable(xactTracker.io.peek.entry.len, beatPacker.io.req.fire())
-    io.resp.bits.cmd_id := RegEnable(xactTracker.io.peek.entry.cmd_id, beatPacker.io.req.fire())
-    io.resp.bits.bytes_read := RegEnable(xactTracker.io.peek.entry.bytes_to_read, beatPacker.io.req.fire())
+    io.resp.bits.scale := RegEnable(xactTracker.io.peek.entry.scale, beatPacker.io.req.fire)
+    io.resp.bits.repeats := RegEnable(xactTracker.io.peek.entry.repeats, beatPacker.io.req.fire)
+    io.resp.bits.pixel_repeats := RegEnable(xactTracker.io.peek.entry.pixel_repeats, beatPacker.io.req.fire)
+    io.resp.bits.len := RegEnable(xactTracker.io.peek.entry.len, beatPacker.io.req.fire)
+    io.resp.bits.cmd_id := RegEnable(xactTracker.io.peek.entry.cmd_id, beatPacker.io.req.fire)
+    io.resp.bits.bytes_read := RegEnable(xactTracker.io.peek.entry.bytes_to_read, beatPacker.io.req.fire)
     io.resp.bits.last := beatPacker.io.out.bits.last
 
     io.counter.collect(core.module.io.counter)
@@ -270,7 +268,7 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf
         if (bytesRequested.getWidth >= log2Up(spadWidthBytes+1)) bytesRequested / spadWidthBytes.U else 0.U)
     io.reserve.entry.spad_row_offset := Mux(req.has_acc_bitwidth, bytesRequested % accWidthBytes.U, bytesRequested % spadWidthBytes.U)
 
-    when (untranslated_a.fire()) {
+    when (untranslated_a.fire) {
       val next_vaddr = req.vaddr + read_bytes_read // send_size
       val new_page = next_vaddr(pgIdxBits-1, 0) === 0.U
       req.vaddr := next_vaddr
@@ -295,7 +293,7 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf
     // TODO the size data is already returned from TileLink, so there's no need for us to store it in the XactTracker ourselves
 
     // Accepting requests to kick-start the state machine
-    when (io.req.fire()) {
+    when (io.req.fire) {
       req := io.req.bits
       bytesRequested := 0.U
 
@@ -312,7 +310,7 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf
     val total_bytes_read = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W))
     when (io.counter.external_reset) {
       total_bytes_read := 0.U
-    }.elsewhen (tl.d.fire()) {
+    }.elsewhen (tl.d.fire) {
       total_bytes_read := total_bytes_read + (1.U << tl.d.bits.size)
     }
 
@@ -390,7 +388,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
 
     val xactBusy_fire = WireInit(false.B)
     val xactBusy_add = Mux(xactBusy_fire, (1.U << xactId).asUInt(), 0.U)
-    val xactBusy_remove = ~Mux(tl.d.fire(), (1.U << tl.d.bits.source).asUInt(), 0.U)
+    val xactBusy_remove = ~Mux(tl.d.fire, (1.U << tl.d.bits.source).asUInt(), 0.U)
     xactBusy := (xactBusy | xactBusy_add) & xactBusy_remove.asUInt()
 
     val state_machine_ready_for_req = WireInit(state === s_idle)
@@ -502,7 +500,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
     }
 
     val untranslated_a = Wire(Decoupled(new TLBundleAWithInfo))
-    xactBusy_fire := untranslated_a.fire() && state === s_writing_new_block
+    xactBusy_fire := untranslated_a.fire && state === s_writing_new_block
     untranslated_a.valid := (state === s_writing_new_block || state === s_writing_beats) && !xactBusy.andR()
     untranslated_a.bits.tl_a := Mux(write_full, putFull, putPartial)
     untranslated_a.bits.vaddr := write_vaddr
@@ -521,7 +519,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
     val tlb_q = Module(new Queue(new TLBundleAWithInfo, 1, pipe=true))
     tlb_q.io.enq <> tlb_arb.io.out
 
-    io.tlb.req.valid := tlb_q.io.deq.fire()
+    io.tlb.req.valid := tlb_q.io.deq.fire
     io.tlb.req.bits.tlb_req.vaddr := tlb_q.io.deq.bits.vaddr
     io.tlb.req.bits.tlb_req.passthrough := false.B
     io.tlb.req.bits.tlb_req.size := 0.U // send_size
@@ -543,11 +541,11 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
 
     tl.a.valid := translate_q.io.deq.valid && !io.tlb.resp.miss
     tl.a.bits := translate_q.io.deq.bits.tl_a
-    tl.a.bits.address := RegEnableThru(io.tlb.resp.paddr, RegNext(io.tlb.req.fire()))
+    tl.a.bits.address := RegEnableThru(io.tlb.resp.paddr, RegNext(io.tlb.req.fire))
 
     tl.d.ready := xactBusy.orR()
 
-    when (untranslated_a.fire()) {
+    when (untranslated_a.fire) {
       when (state === s_writing_new_block) {
         beatsLeft := write_beats - 1.U
 
@@ -584,7 +582,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
     }
 
     // Accepting requests to kick-start the state machine
-    when (io.req.fire()) {
+    when (io.req.fire) {
       val pooled = {
         val cols = dataWidth / inputType.getWidth
         val v1 = io.req.bits.data.asTypeOf(Vec(cols, inputType))
@@ -615,7 +613,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
 
     // External counters
     val total_bytes_sent = RegInit(0.U(CounterExternal.EXTERNAL_WIDTH.W))
-    when (tl.d.fire()) {
+    when (tl.d.fire) {
       total_bytes_sent := total_bytes_sent + (1.U << tl.d.bits.size)
     }
 
diff --git a/src/main/scala/gemmini/DMACommandTracker.scala b/src/main/scala/gemmini/DMACommandTracker.scala
index a687e918..3390cbdf 100644
--- a/src/main/scala/gemmini/DMACommandTracker.scala
+++ b/src/main/scala/gemmini/DMACommandTracker.scala
@@ -21,7 +21,6 @@ class DMACommandTracker[T <: Data](val nCmds: Int, val maxBytes: Int, tag_t: =>
         val bytes_to_read = Input(UInt(log2Up(maxBytes+1).W))
         val cmd_id = Output(cmd_id_t.cloneType)
 
-        override def cloneType: this.type = new BitsT(tag_t.cloneType, cmd_id_t.cloneType).asInstanceOf[this.type]
       }
 
       val bits = new BitsT(tag_t.cloneType, cmd_id_t.cloneType)
@@ -34,7 +33,6 @@ class DMACommandTracker[T <: Data](val nCmds: Int, val maxBytes: Int, tag_t: =>
       val bytes_read = UInt(log2Up(maxBytes+1).W)
       val cmd_id = cmd_id_t.cloneType
 
-      override def cloneType: this.type = new RequestReturnedT(cmd_id_t.cloneType).asInstanceOf[this.type]
     }
 
     val request_returned = Flipped(Valid(new RequestReturnedT(cmd_id_t.cloneType)))
@@ -43,7 +41,6 @@ class DMACommandTracker[T <: Data](val nCmds: Int, val maxBytes: Int, tag_t: =>
       val cmd_id = cmd_id_t.cloneType
       val tag = tag_t.cloneType
 
-      override def cloneType: this.type = new CmdCompletedT(cmd_id_t.cloneType, tag_t.cloneType).asInstanceOf[this.type]
     }
 
     val cmd_completed = Decoupled(new CmdCompletedT(cmd_id_t.cloneType, tag_t.cloneType))
@@ -85,7 +82,7 @@ class DMACommandTracker[T <: Data](val nCmds: Int, val maxBytes: Int, tag_t: =>
     cmds(next_empty_alloc).bytes_left := io.alloc.bits.bytes_to_read
   }
 
-  when (io.request_returned.fire()) {
+  when (io.request_returned.fire) {
     val cmd_id = io.request_returned.bits.cmd_id
     cmds(cmd_id).bytes_left := cmds(cmd_id).bytes_left - io.request_returned.bits.bytes_read
 
@@ -93,7 +90,7 @@ class DMACommandTracker[T <: Data](val nCmds: Int, val maxBytes: Int, tag_t: =>
     assert(cmds(cmd_id).bytes_left >= io.request_returned.bits.bytes_read)
   }
 
-  when (io.cmd_completed.fire()) {
+  when (io.cmd_completed.fire) {
     cmds(io.cmd_completed.bits.cmd_id).valid := false.B
   }
 
diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala
index 6891c09b..de85a31b 100644
--- a/src/main/scala/gemmini/ExecuteController.scala
+++ b/src/main/scala/gemmini/ExecuteController.scala
@@ -682,7 +682,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
       }
     }
     is(flush) {
-      when(mesh.io.req.fire()) {
+      when(mesh.io.req.fire) {
         control_state := flushing
       }
     }
@@ -810,9 +810,9 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   val accReadValid = VecInit(io.acc.read_resp.map(bank => ex_read_from_acc.B && bank.valid && !bank.bits.fromDMA))
   val im2ColValid = io.im2col.resp.valid
 
-  mesh_cntl_signals_q.io.deq.ready := (!cntl.a_fire || mesh.io.a.fire() || !mesh.io.a.ready) &&
-    (!cntl.b_fire || mesh.io.b.fire() || !mesh.io.b.ready) &&
-    (!cntl.d_fire || mesh.io.d.fire() || !mesh.io.d.ready) &&
+  mesh_cntl_signals_q.io.deq.ready := (!cntl.a_fire || mesh.io.a.fire || !mesh.io.a.ready) &&
+    (!cntl.b_fire || mesh.io.b.fire || !mesh.io.b.ready) &&
+    (!cntl.d_fire || mesh.io.d.fire || !mesh.io.d.ready) &&
     (!cntl.first || mesh.io.req.ready)
 
   val dataA_valid = cntl.a_garbage || cntl.a_unpadded_cols === 0.U || Mux(cntl.im2colling, im2ColValid, Mux(cntl.a_read_from_acc, accReadValid(cntl.a_bank_acc), readValid(cntl.a_bank)))
@@ -840,8 +840,8 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   val dataD = VecInit(dataD_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.d_unpadded_cols, d, inputType.zero)})
 
   // Pop responses off the scratchpad io ports
-  when (mesh_cntl_signals_q.io.deq.fire()) {
-    when (cntl.a_fire && mesh.io.a.fire() && !cntl.a_garbage && cntl.a_unpadded_cols > 0.U && !cntl.im2colling) {
+  when (mesh_cntl_signals_q.io.deq.fire) {
+    when (cntl.a_fire && mesh.io.a.fire && !cntl.a_garbage && cntl.a_unpadded_cols > 0.U && !cntl.im2colling) {
       when (cntl.a_read_from_acc) {
         io.acc.read_resp(cntl.a_bank_acc).ready := !io.acc.read_resp(cntl.a_bank_acc).bits.fromDMA
       }.otherwise {
@@ -849,7 +849,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
       }
     }
 
-    when (cntl.b_fire && mesh.io.b.fire() && !cntl.b_garbage && !cntl.accumulate_zeros && cntl.b_unpadded_cols > 0.U) {
+    when (cntl.b_fire && mesh.io.b.fire && !cntl.b_garbage && !cntl.accumulate_zeros && cntl.b_unpadded_cols > 0.U) {
       when (cntl.b_read_from_acc) {
         io.acc.read_resp(cntl.b_bank_acc).ready := !io.acc.read_resp(cntl.b_bank_acc).bits.fromDMA
       }.otherwise {
@@ -857,7 +857,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
       }
     }
 
-    when (cntl.d_fire && mesh.io.d.fire() && !cntl.d_garbage && !cntl.preload_zeros && cntl.d_unpadded_cols > 0.U) {
+    when (cntl.d_fire && mesh.io.d.fire && !cntl.d_garbage && !cntl.preload_zeros && cntl.d_unpadded_cols > 0.U) {
       when (cntl.d_read_from_acc) {
         io.acc.read_resp(cntl.d_bank_acc).ready := !io.acc.read_resp(cntl.d_bank_acc).bits.fromDMA
       }.otherwise {
@@ -882,7 +882,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
     mesh.io.b.bits := dataB.asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType)))
     mesh.io.d.bits := dataD.asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType)))
 
-    mesh.io.req.valid := mesh_cntl_signals_q.io.deq.fire() && (cntl.a_fire || cntl.b_fire || cntl.d_fire)
+    mesh.io.req.valid := mesh_cntl_signals_q.io.deq.fire && (cntl.a_fire || cntl.b_fire || cntl.d_fire)
 
     mesh.io.req.bits.tag.addr := cntl.c_addr
 
@@ -970,7 +970,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   //val complete_lock = RegInit(false.B)
 
   //Seah: added for WS accumulator
-  when(mesh.io.resp.fire() && mesh.io.resp.bits.tag.rob_id.valid) {
+  when(mesh.io.resp.fire && mesh.io.resp.bits.tag.rob_id.valid) {
     output_counter := wrappingAdd(output_counter, 1.U, w_total_output_rows)
     val last = mesh.io.resp.bits.last
 
@@ -1005,29 +1005,29 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   // Performance counter
   CounterEventIO.init(io.counter)
   io.counter.connectEventSignal(CounterEvent.EXE_ACTIVE_CYCLE, control_state === compute)
-  io.counter.connectEventSignal(CounterEvent.EXE_FLUSH_CYCLE, 
+  io.counter.connectEventSignal(CounterEvent.EXE_FLUSH_CYCLE,
     control_state === flushing || control_state === flush)
-  io.counter.connectEventSignal(CounterEvent.EXE_CONTROL_Q_BLOCK_CYCLE, 
+  io.counter.connectEventSignal(CounterEvent.EXE_CONTROL_Q_BLOCK_CYCLE,
     !mesh_cntl_signals_q.io.enq.ready && mesh_cntl_signals_q.io.enq.valid)
-  io.counter.connectEventSignal(CounterEvent.EXE_PRELOAD_HAZ_CYCLE, 
+  io.counter.connectEventSignal(CounterEvent.EXE_PRELOAD_HAZ_CYCLE,
     cmd.valid(0) && DoPreloads(0) && cmd.valid(1) && raw_hazard_pre)
-  io.counter.connectEventSignal(CounterEvent.EXE_OVERLAP_HAZ_CYCLE, 
+  io.counter.connectEventSignal(CounterEvent.EXE_OVERLAP_HAZ_CYCLE,
     cmd.valid(0) && DoPreloads(1) && cmd.valid(1) && DoComputes(0) && cmd.valid(2) && raw_hazard_mulpre)
   io.counter.connectEventSignal(CounterEvent.A_GARBAGE_CYCLES, cntl.a_garbage)
   io.counter.connectEventSignal(CounterEvent.B_GARBAGE_CYCLES, cntl.b_garbage)
   io.counter.connectEventSignal(CounterEvent.D_GARBAGE_CYCLES, cntl.d_garbage)
-  io.counter.connectEventSignal(CounterEvent.ACC_A_WAIT_CYCLE, 
-    !(!cntl.a_fire || mesh.io.a.fire() || !mesh.io.a.ready) && cntl.a_read_from_acc && !cntl.im2colling)
-  io.counter.connectEventSignal(CounterEvent.ACC_B_WAIT_CYCLE, 
-    !(!cntl.b_fire || mesh.io.b.fire() || !mesh.io.b.ready) && cntl.b_read_from_acc)
-  io.counter.connectEventSignal(CounterEvent.ACC_D_WAIT_CYCLE, 
-    !(!cntl.d_fire || mesh.io.d.fire() || !mesh.io.d.ready) && cntl.d_read_from_acc)
-  io.counter.connectEventSignal(CounterEvent.SCRATCHPAD_A_WAIT_CYCLE, 
-    !(!cntl.a_fire || mesh.io.a.fire() || !mesh.io.a.ready) && !cntl.a_read_from_acc && !cntl.im2colling)
-  io.counter.connectEventSignal(CounterEvent.SCRATCHPAD_B_WAIT_CYCLE, 
-    !(!cntl.b_fire || mesh.io.b.fire() || !mesh.io.b.ready) && !cntl.b_read_from_acc)
-  io.counter.connectEventSignal(CounterEvent.SCRATCHPAD_D_WAIT_CYCLE, 
-    !(!cntl.d_fire || mesh.io.d.fire() || !mesh.io.d.ready) && !cntl.d_read_from_acc)
+  io.counter.connectEventSignal(CounterEvent.ACC_A_WAIT_CYCLE,
+    !(!cntl.a_fire || mesh.io.a.fire || !mesh.io.a.ready) && cntl.a_read_from_acc && !cntl.im2colling)
+  io.counter.connectEventSignal(CounterEvent.ACC_B_WAIT_CYCLE,
+    !(!cntl.b_fire || mesh.io.b.fire || !mesh.io.b.ready) && cntl.b_read_from_acc)
+  io.counter.connectEventSignal(CounterEvent.ACC_D_WAIT_CYCLE,
+    !(!cntl.d_fire || mesh.io.d.fire || !mesh.io.d.ready) && cntl.d_read_from_acc)
+  io.counter.connectEventSignal(CounterEvent.SCRATCHPAD_A_WAIT_CYCLE,
+    !(!cntl.a_fire || mesh.io.a.fire || !mesh.io.a.ready) && !cntl.a_read_from_acc && !cntl.im2colling)
+  io.counter.connectEventSignal(CounterEvent.SCRATCHPAD_B_WAIT_CYCLE,
+    !(!cntl.b_fire || mesh.io.b.fire || !mesh.io.b.ready) && !cntl.b_read_from_acc)
+  io.counter.connectEventSignal(CounterEvent.SCRATCHPAD_D_WAIT_CYCLE,
+    !(!cntl.d_fire || mesh.io.d.fire || !mesh.io.d.ready) && !cntl.d_read_from_acc)
 
   if (use_firesim_simulation_counters) {
     val ex_flush_cycle = control_state === flushing || control_state === flush
diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala
index bc028ee9..6e7168e9 100644
--- a/src/main/scala/gemmini/FrontendTLB.scala
+++ b/src/main/scala/gemmini/FrontendTLB.scala
@@ -54,25 +54,27 @@ class DecoupledTLB(entries: Int, maxSize: Int, use_firesim_simulation_counters:
   tlb.io.sfence.bits.rs2 := false.B
   tlb.io.sfence.bits.addr := DontCare
   tlb.io.sfence.bits.asid := DontCare
+  tlb.io.sfence.bits.hv := false.B
+  tlb.io.sfence.bits.hg := false.B
 
   io.ptw <> tlb.io.ptw
   tlb.io.ptw.status := io.req.bits.status
   val exception = io.req.valid && Mux(io.req.bits.tlb_req.cmd === M_XRD, tlb.io.resp.pf.ld || tlb.io.resp.ae.ld, tlb.io.resp.pf.st || tlb.io.resp.ae.st)
   when (exception) { interrupt := true.B }
-  when (interrupt && tlb.io.sfence.fire()) {
+  when (interrupt && tlb.io.sfence.fire) {
     interrupt := false.B
   }
 
   assert(!io.exp.flush_retry || !io.exp.flush_skip, "TLB: flushing with both retry and skip at same time")
 
   CounterEventIO.init(io.counter)
-  io.counter.connectEventSignal(CounterEvent.DMA_TLB_HIT_REQ, io.req.fire() && !tlb.io.resp.miss)
-  io.counter.connectEventSignal(CounterEvent.DMA_TLB_TOTAL_REQ, io.req.fire())
+  io.counter.connectEventSignal(CounterEvent.DMA_TLB_HIT_REQ, io.req.fire && !tlb.io.resp.miss)
+  io.counter.connectEventSignal(CounterEvent.DMA_TLB_TOTAL_REQ, io.req.fire)
   io.counter.connectEventSignal(CounterEvent.DMA_TLB_MISS_CYCLE, tlb.io.resp.miss)
 
   if (use_firesim_simulation_counters) {
-    PerfCounter(io.req.fire() && !tlb.io.resp.miss, "tlb_hits", "total number of tlb hits")
-    PerfCounter(io.req.fire(), "tlb_reqs", "total number of tlb reqs")
+    PerfCounter(io.req.fire && !tlb.io.resp.miss, "tlb_hits", "total number of tlb hits")
+    PerfCounter(io.req.fire, "tlb_reqs", "total number of tlb reqs")
     PerfCounter(tlb.io.resp.miss, "tlb_miss_cycles", "total number of cycles where the tlb is resolving a miss")
   }
 }
@@ -123,7 +125,7 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_fi
     val tlb = if (use_shared_tlb) tlbs.head else tlbs(i)
     val tlbReq = if (use_shared_tlb) tlbArbOpt.get.io.in(i).bits else tlb.io.req.bits
     val tlbReqValid = if (use_shared_tlb) tlbArbOpt.get.io.in(i).valid else tlb.io.req.valid
-    val tlbReqFire = if (use_shared_tlb) tlbArbOpt.get.io.in(i).fire() else tlb.io.req.fire()
+    val tlbReqFire = if (use_shared_tlb) tlbArbOpt.get.io.in(i).fire else tlb.io.req.fire
 
     tlbReqValid := RegNext(client.req.valid && !l0_tlb_hit)
     tlbReq := RegNext(client.req.bits)
diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala
index 0b28316d..9cb15ac9 100644
--- a/src/main/scala/gemmini/GemminiISA.scala
+++ b/src/main/scala/gemmini/GemminiISA.scala
@@ -71,9 +71,6 @@ object GemminiISA {
     val num_cols = UInt(mvin_cols_bits.W)
     val _spacer0 = UInt((MVIN_RS2_ADDR_WIDTH - local_addr_t.getWidth).W)
     val local_addr = local_addr_t.cloneType
-
-    override def cloneType: MvinRs2.this.type =
-      (new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t)).asInstanceOf[this.type]
   }
 
   val MVOUT_RS2_ADDR_WIDTH = 32
@@ -87,9 +84,6 @@ object GemminiISA {
     val num_cols = UInt(mvout_cols_bits.W)
     val _spacer0 = UInt((MVOUT_RS2_ADDR_WIDTH - local_addr_t.getWidth).W)
     val local_addr = local_addr_t.cloneType
-
-    override def cloneType: MvoutRs2.this.type =
-      (new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t)).asInstanceOf[this.type]
   }
 
   val CONFIG_MVIN_RS1_UNUSED_WIDTH = 2
@@ -111,9 +105,6 @@ object GemminiISA {
     val state_id = UInt(CONFIG_MVIN_RS1_STATE_ID_WIDTH.W)
     val shrink = UInt(CONFIG_MVIN_RS1_SHRINK_WIDTH.W)
     val _unused = UInt(CONFIG_MVIN_RS1_UNUSED_WIDTH.W)
-
-    override def cloneType: ConfigMvinRs1.this.type =
-      (new ConfigMvinRs1(scale_bits, stride_bits, pixel_repeat_bits)).asInstanceOf[this.type]
   }
 
   val CONFIG_MVOUT_RS1_UNUSED_WIDTH = 2
@@ -142,8 +133,6 @@ object GemminiISA {
     val pool_stride = UInt(CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH.W)
     val activation = UInt(CONFIG_MVOUT_RS1_ACTIVATION_WIDTH.W)
     val _unused = UInt(CONFIG_MVOUT_RS1_UNUSED_WIDTH.W)
-
-    override def cloneType: ConfigMvoutRs1.this.type = (new ConfigMvoutRs1).asInstanceOf[this.type]
   }
 
   val CONFIG_MVOUT_RS2_ACC_SCALE_WIDTH = 32
@@ -154,9 +143,6 @@ object GemminiISA {
     val acc_scale = UInt(acc_scale_bits.W)
     val _spacer0 = UInt((CONFIG_MVOUT_RS2_STRIDE_WIDTH - stride_bits).W)
     val stride = UInt(stride_bits.W)
-
-    override def cloneType: ConfigMvoutRs2.this.type =
-      (new ConfigMvoutRs2(acc_scale_bits, stride_bits)).asInstanceOf[this.type]
   }
 
   val CONFIG_EX_RS1_CMD_TYPE_WIDTH = 2
@@ -182,9 +168,6 @@ object GemminiISA {
     val activation = UInt(CONFIG_EX_RS1_ACTIVATION_WIDTH.W)
     val dataflow = UInt(CONFIG_EX_RS1_DATAFLOW_WIDTH.W)
     val cmd_type = UInt(CONFIG_EX_RS1_CMD_TYPE_WIDTH.W)
-
-    override def cloneType: ConfigExRs1.this.type =
-      (new ConfigExRs1(acc_scale_bits)).asInstanceOf[this.type]
   }
 
   val CONFIG_EX_RS2_IN_SHIFT_WIDTH = 32
@@ -195,8 +178,6 @@ object GemminiISA {
     val c_stride = UInt(CONFIG_EX_RS2_C_STRIDE_WIDTH.W)
     val relu6_shift = UInt(CONFIG_EX_RS2_RELU6_SHIFT_WIDTH.W)
     val in_shift = UInt(CONFIG_EX_RS2_IN_SHIFT_WIDTH.W)
-
-    override def cloneType: ConfigExRs2.this.type = (new ConfigExRs2).asInstanceOf[this.type]
   }
 
   val PRELOAD_RS_ADDR_WIDTH = 32
@@ -210,9 +191,6 @@ object GemminiISA {
     val num_cols = UInt(preload_cols_bits.W)
     val _spacer0 = UInt((PRELOAD_RS_ADDR_WIDTH - local_addr_t.getWidth).W)
     val local_addr = local_addr_t.cloneType
-
-    override def cloneType: PreloadRs.this.type =
-      (new PreloadRs(preload_rows_bits, preload_cols_bits, local_addr_t)).asInstanceOf[this.type]
   }
 
   val COMPUTED_RS_ADDR_WIDTH = 32
@@ -226,9 +204,6 @@ object GemminiISA {
     val num_cols = UInt(compute_cols_bits.W)
     val _spacer0 = UInt((COMPUTED_RS_ADDR_WIDTH - local_addr_t.getWidth).W)
     val local_addr = local_addr_t.cloneType
-
-    override def cloneType: ComputeRs.this.type =
-      (new ComputeRs(compute_rows_bits, compute_cols_bits, local_addr_t)).asInstanceOf[this.type]
   }
 }
 
diff --git a/src/main/scala/gemmini/Im2Col.scala b/src/main/scala/gemmini/Im2Col.scala
index 2c7f8cbf..a317902b 100644
--- a/src/main/scala/gemmini/Im2Col.scala
+++ b/src/main/scala/gemmini/Im2Col.scala
@@ -24,7 +24,6 @@ class Im2ColReadReq[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[
   val weight_triple_bank = Bool()
   val start_inputting = Bool() //start_inputting_a
 
-  override def cloneType: Im2ColReadReq.this.type = new Im2ColReadReq(config).asInstanceOf[this.type]
 
 }
 
@@ -38,7 +37,6 @@ class Im2ColReadResp[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   //added for sync
   val im2col_delay = Bool()
 
-  override def cloneType: Im2ColReadResp.this.type = new Im2ColReadResp(config).asInstanceOf[this.type]
 
 }
 
diff --git a/src/main/scala/gemmini/InstructionCompression.scala b/src/main/scala/gemmini/InstructionCompression.scala
index 38f373e5..96bc77ee 100644
--- a/src/main/scala/gemmini/InstructionCompression.scala
+++ b/src/main/scala/gemmini/InstructionCompression.scala
@@ -25,17 +25,17 @@ class InstCompressor(implicit p: Parameters) extends Module {
   fused_cmd.rs1 := Cat(buf(0).bits.rs1(31, 0), buf(1).bits.rs1(31, 0))
   fused_cmd.rs2 := Cat(buf(0).bits.rs2(31, 0), buf(1).bits.rs2(31, 0))
 
-  io.in.ready := !buf(0).valid || (buf(0).valid && is_preload && !buf(1).valid) || io.out.fire()
+  io.in.ready := !buf(0).valid || (buf(0).valid && is_preload && !buf(1).valid) || io.out.fire
   io.out.valid := (buf(0).valid && !is_preload) || (buf(0).valid && is_preload && buf(1).valid)
   io.out.bits := Mux(is_preload, fused_cmd, buf(0).bits)
 
   io.busy := buf(0).valid
 
-  when (io.out.fire()) {
+  when (io.out.fire) {
     buf.foreach(_.pop())
   }
 
-  when (io.in.fire()) {
+  when (io.in.fire) {
     val waddr = Mux(buf(0).valid && is_preload && !buf(1).valid, 1.U, 0.U)
     buf(waddr).push(io.in.bits)
   }
@@ -62,11 +62,11 @@ class InstDecompressor(rob_entries: Int)(implicit p: Parameters) extends Module
   unfused_cmd.cmd.rs1 := Mux(pushed_preload, cmd.rs1(31, 0), cmd.rs1(63, 32))
   unfused_cmd.cmd.rs2 := Mux(pushed_preload, cmd.rs2(31, 0), cmd.rs2(63, 32))
 
-  io.in.ready := !buf.valid || (io.out.fire() && !(is_compute && !pushed_preload))
+  io.in.ready := !buf.valid || (io.out.fire && !(is_compute && !pushed_preload))
   io.out.valid := buf.valid
   io.out.bits := Mux(is_compute, unfused_cmd, buf.bits)
 
-  when (io.out.fire()) {
+  when (io.out.fire) {
     when (is_compute && !pushed_preload) {
       pushed_preload := true.B
     }.otherwise {
@@ -74,7 +74,7 @@ class InstDecompressor(rob_entries: Int)(implicit p: Parameters) extends Module
     }
   }
 
-  when (io.in.fire()) {
+  when (io.in.fire) {
     buf.push(io.in.bits)
     pushed_preload := false.B
   }
diff --git a/src/main/scala/gemmini/LoadController.scala b/src/main/scala/gemmini/LoadController.scala
index 49d7b409..2ebee2ca 100644
--- a/src/main/scala/gemmini/LoadController.scala
+++ b/src/main/scala/gemmini/LoadController.scala
@@ -116,7 +116,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
     Mux(io.dma.req.bits.has_acc_bitwidth, cols * actual_rows_read * config.accType.getWidth.U,
       cols * actual_rows_read * config.inputType.getWidth.U) / 8.U
   cmd_tracker.io.alloc.bits.tag.rob_id := cmd.bits.rob_id.bits
-  cmd_tracker.io.request_returned.valid := io.dma.resp.fire() // TODO use a bundle connect
+  cmd_tracker.io.request_returned.valid := io.dma.resp.fire // TODO use a bundle connect
   cmd_tracker.io.request_returned.bits.cmd_id := io.dma.resp.bits.cmd_id // TODO use a bundle connect
   cmd_tracker.io.request_returned.bits.bytes_read := io.dma.resp.bits.bytesRead
   cmd_tracker.io.cmd_completed.ready := io.completed.ready
@@ -130,7 +130,7 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   io.busy := cmd.valid || cmd_tracker.io.busy
 
   // Row counter
-  when (io.dma.req.fire()) {
+  when (io.dma.req.fire) {
     row_counter := wrappingAdd(row_counter, 1.U, actual_rows_read)
 
     assert(block_stride >= rows)
@@ -150,19 +150,19 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
         }
 
-        .elsewhen(DoLoad && cmd_tracker.io.alloc.fire()) {
-          control_state := Mux(io.dma.req.fire(), sending_rows, waiting_for_dma_req_ready)
+        .elsewhen(DoLoad && cmd_tracker.io.alloc.fire) {
+          control_state := Mux(io.dma.req.fire, sending_rows, waiting_for_dma_req_ready)
         }
       }
     }
 
     is (waiting_for_dma_req_ready) {
-      when (io.dma.req.fire()) {
+      when (io.dma.req.fire) {
         control_state := sending_rows
       }
     }
 
     is (sending_rows) {
-      val last_row = row_counter === 0.U || (row_counter === actual_rows_read-1.U && io.dma.req.fire())
+      val last_row = row_counter === 0.U || (row_counter === actual_rows_read-1.U && io.dma.req.fire)
 
       when (last_row) {
         control_state := waiting_for_command
diff --git a/src/main/scala/gemmini/LocalAddr.scala b/src/main/scala/gemmini/LocalAddr.scala
index d6e4f309..92e46ffc 100644
--- a/src/main/scala/gemmini/LocalAddr.scala
+++ b/src/main/scala/gemmini/LocalAddr.scala
@@ -94,7 +94,6 @@ class LocalAddr(sp_banks: Int, sp_bank_entries: Int, acc_banks: Int, acc_bank_en
     data := ~(0.U(maxAddrBits.W))
   }
 
-  override def cloneType: LocalAddr.this.type = new LocalAddr(sp_banks, sp_bank_entries, acc_banks, acc_bank_entries).asInstanceOf[this.type]
 }
 
 object LocalAddr {
diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index cfb9bd8d..a50cc9ac 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -182,7 +182,7 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi
   // Sending outputs
   when (skip) {
     state := idle
-  }.elsewhen(command_p.io.in.fire()) {
+  }.elsewhen(command_p.io.in.fire) {
     when (state === config) {
       state := ld
     }.otherwise {
@@ -202,7 +202,7 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi
   }
 
   // Accepting requests
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req := io.req.bits
     state := config
     b := 0.U
@@ -353,7 +353,7 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
   }
 
   // Sending outputs
-  when(command_p.io.in.fire()) {
+  when(command_p.io.in.fire) {
     when (state === config) {
       state := ld
     }.otherwise {
@@ -379,7 +379,7 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
   }
 
   // Accepting requests
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req := io.req.bits
     state := config
     b := 0.S
@@ -527,7 +527,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
   }
 
   // Sending outputs
-  when(command_p.io.in.fire()) {
+  when(command_p.io.in.fire) {
     when (state === config) {
       state := ld
     }.otherwise {
@@ -550,7 +550,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
   }
 
   // Accepting requests
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req := io.req.bits
     state := config
     kch := 0.U
@@ -759,12 +759,12 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
   }
 
   // Updating "new_weights"
-  when (state === comp && command_p.io.in.fire()) {
+  when (state === comp && command_p.io.in.fire) {
     new_weights := false.B
   }
 
   // Sending outputs
-  when (command_p.io.in.fire() || skip_iteration) {
+  when (command_p.io.in.fire || skip_iteration) {
     when (state === config) {
       state := pre
     }.elsewhen (state === pre) {
@@ -804,7 +804,7 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
   }
 
   // Accepting requests
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req := io.req.bits
     state := Mux(io.req.bits.trans_input_3120, config, pre)
 
@@ -998,7 +998,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
   // Sending outputs
   when (skip) {
     state := idle
-  }.elsewhen(command_p.io.in.fire()) {
+  }.elsewhen(command_p.io.in.fire) {
     when (req.no_pool) {
       val next_och = floorAdd(och, block_size.U, ochs)
       val next_ocol = floorAdd(ocol, block_size.U, ocols, next_och === 0.U)
@@ -1029,7 +1029,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
   }
 
   // Accepting requests
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req := io.req.bits
     state := Mux(io.req.bits.no_pool, st, pre_pool_config)
 
@@ -1345,7 +1345,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
 
   ld_bias.io.req.valid := !loop_requesting_ld_bias.ld_bias_started && loop_requesting_ld_bias.configured
 
-  when (ld_bias.io.req.fire()) {
+  when (ld_bias.io.req.fire) {
     loop_requesting_ld_bias.running := true.B
     loop_requesting_ld_bias.ld_bias_started := true.B
 
@@ -1370,7 +1370,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
 
   ld_input.io.req.valid := !loop_requesting_ld_input.ld_input_started && loop_requesting_ld_input.configured
 
-  when (ld_input.io.req.fire()) {
+  when (ld_input.io.req.fire) {
     loop_requesting_ld_input.running := true.B
     loop_requesting_ld_input.ld_input_started := true.B
   }
@@ -1388,7 +1388,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
 
   ld_weights.io.req.valid := !loop_requesting_ld_weights.ld_weights_started && loop_requesting_ld_weights.configured
 
-  when (ld_weights.io.req.fire()) {
+  when (ld_weights.io.req.fire) {
     loop_requesting_ld_weights.running := true.B
     loop_requesting_ld_weights.ld_weights_started := true.B
   }
@@ -1412,7 +1412,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
   ex.io.req.valid := !loop_requesting_ex.ex_started && loop_requesting_ex.ld_bias_started &&
     loop_requesting_ex.ld_input_started && loop_requesting_ex.ld_weights_started && loop_requesting_ex.configured
 
-  when (ex.io.req.fire()) {
+  when (ex.io.req.fire) {
     loop_requesting_ex.running := true.B
     loop_requesting_ex.ex_started := true.B
 
@@ -1435,7 +1435,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds: I
 
   st.io.req.valid := !loop_requesting_st.st_started && loop_requesting_st.ex_started && loop_requesting_st.configured
 
-  when (st.io.req.fire()) {
+  when (st.io.req.fire) {
     loop_requesting_st.running := true.B
     loop_requesting_st.st_started := true.B
 
diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala
index 4841e2be..80ece4fc 100644
--- a/src/main/scala/gemmini/LoopMatmul.scala
+++ b/src/main/scala/gemmini/LoopMatmul.scala
@@ -90,7 +90,7 @@ class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
 
   io.loop_id := req.loop_id
 
-  when (io.cmd.fire()) {
+  when (io.cmd.fire) {
     // The order here is k, j, i
     val i_blocks = Mux(req.transpose, max_blocks, 1.U)
     val k_blocks = Mux(req.transpose, 1.U, max_blocks)
@@ -106,7 +106,7 @@ class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
     }
   }
 
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req := io.req.bits
     state := ld
     i := 0.U
@@ -198,7 +198,7 @@ class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
 
   io.loop_id := req.loop_id
 
-  when (io.cmd.fire()) {
+  when (io.cmd.fire) {
     // The order here is k, j, i
     val j_blocks = Mux(req.transpose, 1.U, max_blocks)
     val k_blocks = Mux(req.transpose, max_blocks, 1.U)
@@ -214,7 +214,7 @@ class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
     }
   }
 
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req := io.req.bits
     state := ld
     j := 0.U
@@ -296,7 +296,7 @@ class LoopMatmulLdD(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
 
   when (req.dram_addr === 0.U) {
     state := idle
-  }.elsewhen (io.cmd.fire()) {
+  }.elsewhen (io.cmd.fire) {
     // The order here is k, j, i
     val next_i = floorAdd(i, 1.U, req.max_i)
     val next_j = floorAdd(j, max_blocks, req.max_j, next_i === 0.U)
@@ -309,7 +309,7 @@ class LoopMatmulLdD(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
     }
   }
 
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req := io.req.bits
     state := ld
     j := 0.U
@@ -450,7 +450,7 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth
 
   io.loop_id := req.loop_id
 
-  when (io.cmd.fire()) {
+  when (io.cmd.fire) {
     when (state === pre) {
       state := comp
     }.otherwise {
@@ -466,7 +466,7 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth
     }
   }
 
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req := io.req.bits
     state := pre
     j := 0.U
@@ -566,7 +566,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
 
   when (req.dram_addr === 0.U) {
     state := idle
-  }.elsewhen (io.cmd.fire()) {
+  }.elsewhen (io.cmd.fire) {
     // The order here is k, j, i
     val next_i = floorAdd(i, 1.U, req.max_i)
     val next_j = floorAdd(j, max_blocks, req.max_j, next_i === 0.U)
@@ -579,7 +579,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
     }
   }
 
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req := io.req.bits
     state := st
     j := 0.U
@@ -827,7 +827,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds:
 
   ldA.io.req.valid := !loop_requesting_ldA.lda_started && loop_requesting_ldA.configured
 
-  when (ldA.io.req.fire()) {
+  when (ldA.io.req.fire) {
     loop_requesting_ldA.running := true.B
     loop_requesting_ldA.lda_started := true.B
   }
@@ -846,7 +846,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds:
 
   ldB.io.req.valid := !loop_requesting_ldB.ldb_started && loop_requesting_ldB.configured
 
-  when (ldB.io.req.fire()) {
+  when (ldB.io.req.fire) {
     loop_requesting_ldB.running := true.B
     loop_requesting_ldB.ldb_started := true.B
   }
@@ -870,7 +870,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds:
   ex.io.req.valid := !loop_requesting_ex.ex_started && loop_requesting_ex.lda_started &&
     loop_requesting_ex.ldb_started && loop_requesting_ex.ldd_started && loop_requesting_ex.configured
 
-  when (ex.io.req.fire()) {
+  when (ex.io.req.fire) {
     loop_requesting_ex.running := true.B
     loop_requesting_ex.ex_started := true.B
 
@@ -893,7 +893,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds:
 
   ldD.io.req.valid := !loop_requesting_ldD.ldd_started && loop_requesting_ldD.configured
 
-  when (ldD.io.req.fire()) {
+  when (ldD.io.req.fire) {
     loop_requesting_ldD.running := true.B
     loop_requesting_ldD.ldd_started := true.B
 
@@ -917,7 +917,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, rob_size: Int, max_lds:
 
   stC.io.req.valid := !loop_requesting_st.st_started && loop_requesting_st.ex_started && loop_requesting_st.configured
 
-  when (stC.io.req.fire()) {
+  when (stC.io.req.fire) {
     loop_requesting_st.running := true.B
     loop_requesting_st.st_started := true.B
 
diff --git a/src/main/scala/gemmini/LoopUnroller.scala b/src/main/scala/gemmini/LoopUnroller.scala
index 47c33f68..02ac7d71 100644
--- a/src/main/scala/gemmini/LoopUnroller.scala
+++ b/src/main/scala/gemmini/LoopUnroller.scala
@@ -84,11 +84,11 @@ class LoopUnroller(block_size: Int)(implicit p: Parameters) extends Module {
 
   when (cmd.valid) {
     when (is_loop && (state === idle || state === preload)) {
-      when (io.out.fire()) {
+      when (io.out.fire) {
         state := compute
       }
     }.elsewhen(is_loop && state === compute) {
-      when (io.out.fire()) {
+      when (io.out.fire) {
         increment()
         state := Mux(last_iteration, idle, preload)
         cmd.ready := last_iteration
diff --git a/src/main/scala/gemmini/MeshWithDelays.scala b/src/main/scala/gemmini/MeshWithDelays.scala
index db40debf..edd28cf6 100644
--- a/src/main/scala/gemmini/MeshWithDelays.scala
+++ b/src/main/scala/gemmini/MeshWithDelays.scala
@@ -14,7 +14,6 @@ class MeshWithDelaysReq[T <: Data: Arithmetic, TagT <: TagQueueTag with Data](ac
   val tag = tagType
   val flush = UInt(2.W) // TODO magic number
 
-  override def cloneType: MeshWithDelaysReq.this.type = new MeshWithDelaysReq(accType, tagType, block_size).asInstanceOf[this.type]
 }
 
 class MeshWithDelaysResp[T <: Data: Arithmetic, TagT <: TagQueueTag with Data](outType: T, meshCols: Int, tileCols: Int, block_size: Int, tagType: TagT) extends Bundle {
@@ -23,7 +22,6 @@ class MeshWithDelaysResp[T <: Data: Arithmetic, TagT <: TagQueueTag with Data](o
   val tag = tagType
   val last = Bool()
 
-  override def cloneType: MeshWithDelaysResp.this.type = new MeshWithDelaysResp(outType, meshCols, tileCols, block_size, tagType).asInstanceOf[this.type]
 }
 
 // TODO Add io.out.ready back in. Before it was removed, it didn't work when banking, and it seemed to assume that SRAM outputs stay steady when ren is low
@@ -99,9 +97,9 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
   val total_fires = req.bits.total_rows
   val fire_counter = RegInit(0.U(log2Up(block_size).W))
 
-  val a_buf = RegEnable(io.a.bits, io.a.fire())
-  val b_buf = RegEnable(io.b.bits, io.b.fire())
-  val d_buf = RegEnable(io.d.bits, io.d.fire())
+  val a_buf = RegEnable(io.a.bits, io.a.fire)
+  val b_buf = RegEnable(io.b.bits, io.b.fire)
+  val d_buf = RegEnable(io.d.bits, io.d.fire)
 
   val a_written = RegInit(false.B)
   val b_written = RegInit(false.B)
@@ -113,7 +111,7 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
 
   val last_fire = fire_counter === total_fires - 1.U && input_next_row_into_spatial_array
 
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req.push(io.req.bits)
     in_prop := io.req.bits.pe_control.propagate ^ in_prop
     matmul_id := wrappingAdd(matmul_id, 1.U, max_simultaneous_matmuls)
@@ -130,15 +128,15 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
     fire_counter := wrappingAdd(fire_counter, 1.U, total_fires)
   }
 
-  when (io.a.fire()) {
+  when (io.a.fire) {
     a_written := true.B
   }
 
-  when (io.b.fire()) {
+  when (io.b.fire) {
     b_written := true.B
   }
 
-  when (io.d.fire()) {
+  when (io.d.fire) {
     d_written := true.B
   }
 
@@ -216,14 +214,13 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
       tag.make_this_garbage()
     }
 
-    override def cloneType: TagWithIdAndTotalRows.this.type = (new TagWithIdAndTotalRows).asInstanceOf[this.type]
   }
 
   val matmul_id_of_output = wrappingAdd(matmul_id, Mux(io.req.bits.pe_control.dataflow === Dataflow.OS.id.U, 3.U, 2.U), max_simultaneous_matmuls)
   val matmul_id_of_current = wrappingAdd(matmul_id, 1.U, max_simultaneous_matmuls)
 
   val tagq = Module(new TagQueue(new TagWithIdAndTotalRows, tagqlen))
-  tagq.io.enq.valid := io.req.fire() && io.req.bits.flush === 0.U
+  tagq.io.enq.valid := io.req.fire && io.req.bits.flush === 0.U
   tagq.io.enq.bits.tag := io.req.bits.tag
   tagq.io.enq.bits.total_rows := DontCare
   tagq.io.enq.bits.id := matmul_id_of_output
@@ -240,7 +237,7 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
   tagq.io.deq.ready := io.resp.valid && io.resp.bits.last && out_matmul_id === tagq.io.deq.bits.id
 
   val total_rows_q = Module(new Queue(new TagWithIdAndTotalRows, tagqlen))
-  total_rows_q.io.enq.valid := io.req.fire() && io.req.bits.flush === 0.U
+  total_rows_q.io.enq.valid := io.req.fire && io.req.bits.flush === 0.U
   total_rows_q.io.enq.bits.tag := DontCare
   total_rows_q.io.enq.bits.total_rows := io.req.bits.total_rows
   total_rows_q.io.enq.bits.id := matmul_id_of_current
@@ -257,5 +254,5 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
     req.valid := false.B
   }
 
-  assert(!(io.req.fire() && !tagq.io.enq.ready && io.req.bits.flush === 0.U))
+  assert(!(io.req.fire && !tagq.io.enq.ready && io.req.bits.flush === 0.U))
 }
diff --git a/src/main/scala/gemmini/MultiHeadedQueue.scala b/src/main/scala/gemmini/MultiHeadedQueue.scala
index c029f6f5..79900dfa 100644
--- a/src/main/scala/gemmini/MultiHeadedQueue.scala
+++ b/src/main/scala/gemmini/MultiHeadedQueue.scala
@@ -33,7 +33,7 @@ class MultiHeadedQueue[T <: Data](gen: T, entries: Int, heads: Int, maxpop: Int
   }
 
   // Pushing
-  when (io.enq.fire()) {
+  when (io.enq.fire) {
     regs(waddr) := io.enq.bits
     waddr := wrappingAdd(waddr, 1.U, entries)
     len := len + 1.U
@@ -42,7 +42,7 @@ class MultiHeadedQueue[T <: Data](gen: T, entries: Int, heads: Int, maxpop: Int
   // Popping
   when(io.deq.pop > 0.U) {
     raddr := wrappingAdd(raddr, io.deq.pop, entries)
-    len := len - io.deq.pop + io.enq.fire()
+    len := len - io.deq.pop + io.enq.fire
   }
 
   assert(io.deq.pop <= len && io.deq.pop <= heads.U && io.deq.pop <= maxpop.U)
diff --git a/src/main/scala/gemmini/MultiTailedQueue.scala b/src/main/scala/gemmini/MultiTailedQueue.scala
index ea16728b..7a0bb3d7 100644
--- a/src/main/scala/gemmini/MultiTailedQueue.scala
+++ b/src/main/scala/gemmini/MultiTailedQueue.scala
@@ -4,7 +4,7 @@ import chisel3._
 import chisel3.util._
 import Util._
 
-class MultiTailedQueue[T <: Data](gen: T, entries: Int, maxpush: Int) 
+class MultiTailedQueue[T <: Data](gen: T, entries: Int, maxpush: Int)
   extends Module {
   val io = IO(new Bundle {
     val enq = new Bundle {
@@ -36,10 +36,10 @@ class MultiTailedQueue[T <: Data](gen: T, entries: Int, maxpush: Int)
   // pop interface
   io.deq.bits := regs(raddr)
   io.deq.valid := (avail < entries.U)
-  raddr := wrappingAdd(raddr, io.deq.fire(), entries)
+  raddr := wrappingAdd(raddr, io.deq.fire, entries)
 
   // countgth calc
-  avail := avail - io.enq.push + io.deq.fire()
+  avail := avail - io.enq.push + io.deq.fire
 }
 
 object MultiTailedQueue {
diff --git a/src/main/scala/gemmini/PE.scala b/src/main/scala/gemmini/PE.scala
index e10318a3..5f7205bd 100644
--- a/src/main/scala/gemmini/PE.scala
+++ b/src/main/scala/gemmini/PE.scala
@@ -9,7 +9,6 @@ class PEControl[T <: Data : Arithmetic](accType: T) extends Bundle {
   val propagate = UInt(1.W) // Which register should be propagated (and which should be accumulated)?
   val shift = UInt(log2Up(accType.getWidth).W) // TODO this isn't correct for Floats
 
-  override def cloneType: PEControl.this.type = new PEControl(accType).asInstanceOf[this.type]
 }
 
 // TODO update documentation
diff --git a/src/main/scala/gemmini/Pipeline.scala b/src/main/scala/gemmini/Pipeline.scala
index 323686d1..0aeafd18 100644
--- a/src/main/scala/gemmini/Pipeline.scala
+++ b/src/main/scala/gemmini/Pipeline.scala
@@ -42,7 +42,7 @@ class Pipeline[T <: Data] (gen: T, latency: Int)(comb: Seq[T => T] = Seq.fill(la
       }
     }
     // When the pipeline stage behind you is valid then become true
-    when(io.in.fire()) {
+    when(io.in.fire) {
       valids.head := true.B
     }
     (valids.tail, valids.init).zipped.foreach { case (v2, v1) =>
@@ -52,7 +52,7 @@ class Pipeline[T <: Data] (gen: T, latency: Int)(comb: Seq[T => T] = Seq.fill(la
     }
 
     // Stages
-    when(io.in.fire()) {
+    when(io.in.fire) {
       stages.head := comb.head(io.in.bits)
     }
     io.out.bits := comb.last(stages.last)
diff --git a/src/main/scala/gemmini/PixelRepeater.scala b/src/main/scala/gemmini/PixelRepeater.scala
index e0eb4fd7..ddab4422 100644
--- a/src/main/scala/gemmini/PixelRepeater.scala
+++ b/src/main/scala/gemmini/PixelRepeater.scala
@@ -15,8 +15,6 @@ class PixelRepeaterReq[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_c
   val tag: Tag = tag_t.cloneType
 
   assert(block_cols <= 255, "len must be longer")
-
-  override def cloneType: PixelRepeaterReq.this.type = new PixelRepeaterReq(t, laddr_t, block_cols, aligned_to, tag_t).asInstanceOf[this.type]
 }
 
 class PixelRepeaterResp[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols: Int, aligned_to: Int, tag_t: Tag) extends Bundle {
@@ -25,8 +23,6 @@ class PixelRepeaterResp[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_
   val laddr: LocalAddr = laddr_t.cloneType
   val last: Bool = Bool()
   val tag: Tag = tag_t.cloneType
-
-  override def cloneType: PixelRepeaterResp.this.type = new PixelRepeaterResp(t, laddr_t, block_cols, aligned_to, tag_t).asInstanceOf[this.type]
 }
 
 class PixelRepeater[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols: Int, aligned_to: Int, tag_t: Tag, passthrough: Boolean) extends Module {
@@ -75,7 +71,7 @@ class PixelRepeater[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols
 
     io.resp.valid := req.valid && !underflow
 
-    when(io.resp.fire() || underflow) {
+    when(io.resp.fire || underflow) {
       req.bits.pixel_repeats := req.bits.pixel_repeats - 1.U
 
       when(req.bits.pixel_repeats === 0.U) {
@@ -83,12 +79,12 @@ class PixelRepeater[T <: Data, Tag <: Data](t: T, laddr_t: LocalAddr, block_cols
       }
     }
 
-    when(io.req.fire()) {
+    when(io.req.fire) {
       req.push(io.req.bits)
       req.bits.pixel_repeats := io.req.bits.pixel_repeats - 1.U
     }
 
-    when(reset.toBool()) {
+    when(reset.asBool()) {
       req.pop()
     }
   }
diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala
index 7135969f..e8c6ed26 100644
--- a/src/main/scala/gemmini/ReservationStation.scala
+++ b/src/main/scala/gemmini/ReservationStation.scala
@@ -21,7 +21,6 @@ class ReservationStationIssue[T <: Data](cmd_t: T, rob_entries: Int) extends Bun
 
   def fire(dummy: Int=0) = valid && ready
 
-  override def cloneType: this.type = new ReservationStationIssue(cmd_t, rob_entries).asInstanceOf[this.type]
 }
 
 // TODO we don't need to store the full command in here. We should be able to release the command directly into the relevant controller and only store the associated metadata in the ROB. This would reduce the size considerably
@@ -70,7 +69,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
   }
 
   val instructions_allocated = RegInit(0.U(32.W))
-  when (io.alloc.fire()) {
+  when (io.alloc.fire) {
     instructions_allocated := instructions_allocated + 1.U
   }
   dontTouch(instructions_allocated)
@@ -131,7 +130,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
   val new_partial_allocs = Wire(Vec(reservation_station_partial_entries, Bool()))
   new_partial_allocs.foreach(_ := false.B)
   val new_entry_oh = new_full_allocs ++ new_partial_allocs
-  val alloc_fire = io.alloc.fire()
+  val alloc_fire = io.alloc.fire
 
   val raws_probe = WireInit(0.U(rob_entries.W))
   val waws_probe = WireInit(0.U(rob_entries.W))
@@ -367,7 +366,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
       new_full_allocs(full_alloc_id) := true.B
     }
 
-    when (io.alloc.fire()) {
+    when (io.alloc.fire) {
       when (new_entry.is_config && new_entry.q === exq && !is_im2col) {
         a_stride := new_entry.cmd.rs1(31, 16) // TODO magic numbers // TODO this needs to be kept in sync with ExecuteController.scala
         c_stride := new_entry.cmd.rs2(63, 48) // TODO magic numbers // TODO this needs to be kept in sync with ExecuteController.scala
@@ -420,7 +419,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
   }
 
   // Mark entries as completed once they've returned
-  when (io.completed.fire()) {
+  when (io.completed.fire) {
     entries.foreach(_.bits.deps(io.completed.bits) := false.B)
 
     for ((e, i) <- entries.zipWithIndex) {
@@ -461,7 +460,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
 
   val cycles_since_issue = RegInit(0.U(16.W))
 
-  when (io.issue.ld.fire() || io.issue.st.fire() || io.issue.ex.fire() || !io.busy || io.completed.fire()) {
+  when (io.issue.ld.fire() || io.issue.st.fire() || io.issue.ex.fire() || !io.busy || io.completed.fire) {
     cycles_since_issue := 0.U
   }.elsewhen(io.busy) {
     cycles_since_issue := cycles_since_issue + 1.U
diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala
index 2b97d5a9..bf3be036 100644
--- a/src/main/scala/gemmini/Scratchpad.scala
+++ b/src/main/scala/gemmini/Scratchpad.scala
@@ -24,7 +24,6 @@ class ScratchpadMemReadRequest[U <: Data](local_addr_t: LocalAddr, scale_t_bits:
   val cmd_id = UInt(8.W) // TODO don't use a magic number here
   val status = new MStatus
 
-  override def cloneType: this.type = new ScratchpadMemReadRequest(local_addr_t, scale_t_bits).asInstanceOf[this.type]
 }
 
 class ScratchpadMemWriteRequest(local_addr_t: LocalAddr, scale_t_bits: Int)
@@ -45,7 +44,6 @@ class ScratchpadMemWriteRequest(local_addr_t: LocalAddr, scale_t_bits: Int)
   val pool_en = Bool()
   val store_en = Bool()
 
-  override def cloneType: this.type = new ScratchpadMemWriteRequest(local_addr_t, scale_t_bits).asInstanceOf[this.type]
 }
 
 class ScratchpadMemWriteResponse extends Bundle {
@@ -61,7 +59,6 @@ class ScratchpadReadMemIO[U <: Data](local_addr_t: LocalAddr, scale_t_bits: Int)
   val req = Decoupled(new ScratchpadMemReadRequest(local_addr_t, scale_t_bits))
   val resp = Flipped(Valid(new ScratchpadMemReadResponse))
 
-  override def cloneType: this.type = new ScratchpadReadMemIO(local_addr_t, scale_t_bits).asInstanceOf[this.type]
 }
 
 class ScratchpadWriteMemIO(local_addr_t: LocalAddr, scale_t_bits: Int)
@@ -69,7 +66,6 @@ class ScratchpadWriteMemIO(local_addr_t: LocalAddr, scale_t_bits: Int)
   val req = Decoupled(new ScratchpadMemWriteRequest(local_addr_t, scale_t_bits))
   val resp = Flipped(Valid(new ScratchpadMemWriteResponse))
 
-  override def cloneType: this.type = new ScratchpadWriteMemIO(local_addr_t, scale_t_bits).asInstanceOf[this.type]
 }
 
 class ScratchpadReadReq(val n: Int) extends Bundle {
@@ -142,7 +138,7 @@ class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean, us
   }
 
   val raddr = io.read.req.bits.addr
-  val ren = io.read.req.fire()
+  val ren = io.read.req.fire
   val rdata = if (single_ported) {
     assert(!(ren && io.write.en))
     read(raddr, ren && !io.write.en).asUInt()
@@ -158,7 +154,7 @@ class ScratchpadBank(n: Int, w: Int, aligned_to: Int, single_ported: Boolean, us
   q.io.enq.bits.data := rdata
   q.io.enq.bits.fromDMA := RegNext(fromDMA)
 
-  val q_will_be_empty = (q.io.count +& q.io.enq.fire()) - q.io.deq.fire() === 0.U
+  val q_will_be_empty = (q.io.count +& q.io.enq.fire) - q.io.deq.fire === 0.U
   io.read.req.ready := q_will_be_empty && !singleport_busy_with_write
 
   io.read.resp <> q.io.deq
@@ -292,7 +288,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
 
     io.dma.write.resp.valid := false.B
     io.dma.write.resp.bits.cmd_id := write_dispatch_q.bits.cmd_id
-    when (write_dispatch_q.bits.laddr.is_garbage() && write_dispatch_q.fire()) {
+    when (write_dispatch_q.bits.laddr.is_garbage() && write_dispatch_q.fire) {
       io.dma.write.resp.valid := true.B
     }
 
@@ -397,9 +393,9 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
     reader.module.io.resp.ready := Mux(reader.module.io.resp.bits.is_acc && reader.module.io.resp.bits.has_acc_bitwidth,
       mvin_scale_acc_in.ready, mvin_scale_in.ready)
 
-    val mvin_scale_finished = mvin_scale_pixel_repeater.io.resp.fire() && mvin_scale_pixel_repeater.io.resp.bits.last
-    val mvin_scale_acc_finished = mvin_scale_acc_out.fire() && mvin_scale_acc_out.bits.last
-    val zero_writer_finished = zero_writer_pixel_repeater.io.resp.fire() && zero_writer_pixel_repeater.io.resp.bits.last
+    val mvin_scale_finished = mvin_scale_pixel_repeater.io.resp.fire && mvin_scale_pixel_repeater.io.resp.bits.last
+    val mvin_scale_acc_finished = mvin_scale_acc_out.fire && mvin_scale_acc_out.bits.last
+    val zero_writer_finished = zero_writer_pixel_repeater.io.resp.fire && zero_writer_pixel_repeater.io.resp.bits.last
 
     val zero_writer_bytes_read = Mux(zero_writer_pixel_repeater.io.resp.bits.laddr.is_acc_addr,
       zero_writer_pixel_repeater.io.resp.bits.tag.cols * (accType.getWidth / 8).U,
@@ -460,7 +456,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
           bio.read.req.bits.addr := write_dispatch_q.bits.laddr.sp_row()
           bio.read.req.bits.fromDMA := true.B
 
-          when (bio.read.req.fire()) {
+          when (bio.read.req.fire) {
             write_dispatch_q.ready := true.B
             write_scale_q.io.enq.valid := true.B
 
@@ -485,7 +481,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
         dma_read_pipe.ready := writer.module.io.req.ready &&
           !write_issue_q.io.deq.bits.laddr.is_acc_addr && write_issue_q.io.deq.bits.laddr.sp_bank() === i.U && // I believe we don't need to check that write_issue_q is valid here, because if the SRAM's resp is valid, then that means that the write_issue_q's deq should also be valid
           !write_issue_q.io.deq.bits.laddr.is_garbage()
-        when (dma_read_pipe.fire()) {
+        when (dma_read_pipe.fire) {
           writeData.valid := true.B
           writeData.bits := dma_read_pipe.bits.data
         }
@@ -630,7 +626,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
           bio.read.req.bits.scale := write_dispatch_q.bits.acc_scale.asTypeOf(bio.read.req.bits.scale)
           bio.read.req.bits.fromDMA := true.B
 
-          when (bio.read.req.fire()) {
+          when (bio.read.req.fire) {
             write_dispatch_q.ready := true.B
             write_scale_q.io.enq.valid := true.B
 
@@ -701,10 +697,10 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T,
         val consecutive_write_block = RegInit(false.B)
         if (acc_singleported) {
           val consecutive_write_sub_bank = RegInit(0.U((1 max log2Ceil(acc_sub_banks)).W))
-          when (bio.write.fire() && bio.write.bits.acc &&
+          when (bio.write.fire && bio.write.bits.acc &&
             (bio.write.bits.addr(log2Ceil(acc_sub_banks)-1,0) === consecutive_write_sub_bank)) {
             consecutive_write_block := true.B
-          } .elsewhen (bio.write.fire() && bio.write.bits.acc) {
+          } .elsewhen (bio.write.fire && bio.write.bits.acc) {
             consecutive_write_block := false.B
             consecutive_write_sub_bank := bio.write.bits.addr(log2Ceil(acc_sub_banks)-1,0)
           } .otherwise {
diff --git a/src/main/scala/gemmini/SharedExtMem.scala b/src/main/scala/gemmini/SharedExtMem.scala
index 9d0e1802..f3acdd2e 100644
--- a/src/main/scala/gemmini/SharedExtMem.scala
+++ b/src/main/scala/gemmini/SharedExtMem.scala
@@ -20,7 +20,6 @@ class ExtMemIO extends Bundle {
 class ExtSpadMemIO(sp_banks: Int, acc_banks: Int, acc_sub_banks: Int) extends Bundle {
   val spad = Vec(sp_banks, new ExtMemIO)
   val acc = Vec(acc_banks, Vec(acc_sub_banks, new ExtMemIO))
-  override def cloneType: this.type = new ExtSpadMemIO(sp_banks, acc_banks, acc_sub_banks).asInstanceOf[this.type]
 }
 
 
diff --git a/src/main/scala/gemmini/StoreController.scala b/src/main/scala/gemmini/StoreController.scala
index 28de72c3..e2c82dd5 100644
--- a/src/main/scala/gemmini/StoreController.scala
+++ b/src/main/scala/gemmini/StoreController.scala
@@ -156,7 +156,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   cmd_tracker.io.alloc.bits.bytes_to_read := Mux(!pooling_is_enabled, Mux(mvout_1d_enabled, mvout_1d_rows, rows*blocks), pool_total_rows) // TODO do we have to add upad and lpad to this?
   cmd_tracker.io.alloc.bits.tag.rob_id := cmd.bits.rob_id.bits
 
-  cmd_tracker.io.request_returned.valid := io.dma.resp.fire() // TODO use a bundle connect
+  cmd_tracker.io.request_returned.valid := io.dma.resp.fire // TODO use a bundle connect
   cmd_tracker.io.request_returned.bits.cmd_id := io.dma.resp.bits.cmd_id // TODO use a bundle connect
   cmd_tracker.io.request_returned.bits.bytes_read := 1.U
   cmd_tracker.io.cmd_completed.ready := io.completed.ready
@@ -170,7 +170,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   io.busy := cmd.valid || cmd_tracker.io.busy
 
   // Row counter
-  when (io.dma.req.fire()) {
+  when (io.dma.req.fire) {
     when (!pooling_is_enabled) {
       //where does rows come from?
       //row_counter := wrappingAdd(row_counter, 1.U, rows)
@@ -223,20 +223,20 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
         }
           .elsewhen(DoStore && cmd_tracker.io.alloc.fire()) {
             val next_state = Mux(pooling_is_enabled, pooling, sending_rows)
-            control_state := Mux(io.dma.req.fire(), next_state, waiting_for_dma_req_ready)
+            control_state := Mux(io.dma.req.fire, next_state, waiting_for_dma_req_ready)
           }
       }
     }
 
     is (waiting_for_dma_req_ready) {
-      when (io.dma.req.fire()) {
+      when (io.dma.req.fire) {
         control_state := Mux(pooling_is_enabled, pooling, sending_rows)
       }
     }
 
     is (sending_rows) {
-      val last_block = block_counter === blocks - 1.U && io.dma.req.fire()
-      val last_row = Mux(mvout_1d_enabled, row_counter === mvout_1d_rows - 1.U, row_counter === rows - 1.U) && io.dma.req.fire()
+      val last_block = block_counter === blocks - 1.U && io.dma.req.fire
+      val last_row = Mux(mvout_1d_enabled, row_counter === mvout_1d_rows - 1.U, row_counter === rows - 1.U) && io.dma.req.fire
       //normal mvout: row, 1D mvout: orows*ocols
 
       val only_one_dma_req = block_counter === 0.U && row_counter === 0.U // This is a special case when only one DMA request is made
@@ -251,7 +251,7 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
       // TODO Is it really possible for all the counters to be 0 here?
       val last_row = (porow_counter === 0.U && pocol_counter === 0.U && wrow_counter === 0.U && wcol_counter === 0.U) ||
         (porow_counter === pool_porows - 1.U && pocol_counter === pool_pocols - 1.U &&
-          wrow_counter === pool_size - 1.U && wcol_counter === pool_size - 1.U && io.dma.req.fire())
+          wrow_counter === pool_size - 1.U && wcol_counter === pool_size - 1.U && io.dma.req.fire)
 
       when (last_row) {
         control_state := waiting_for_command
diff --git a/src/main/scala/gemmini/SyncMem.scala b/src/main/scala/gemmini/SyncMem.scala
index 799e45c5..43200015 100644
--- a/src/main/scala/gemmini/SyncMem.scala
+++ b/src/main/scala/gemmini/SyncMem.scala
@@ -10,7 +10,6 @@ class SinglePortedSyncMemIO[T <: Data](n: Int, t: T) extends Bundle {
   val wen = Input(Bool())
   val ren = Input(Bool())
 
-  override def cloneType = (new SinglePortedSyncMemIO(n, t)).asInstanceOf[this.type]
 }
 
 class SinglePortSyncMem[T <: Data](n: Int, t: T) extends Module {
diff --git a/src/main/scala/gemmini/TagQueue.scala b/src/main/scala/gemmini/TagQueue.scala
index 57b9cf4e..9a6464c3 100644
--- a/src/main/scala/gemmini/TagQueue.scala
+++ b/src/main/scala/gemmini/TagQueue.scala
@@ -28,19 +28,19 @@ class TagQueue[T <: Data with TagQueueTag](t: T, entries: Int) extends Module {
   io.deq.bits := regs(raddr)
   io.all := regs
 
-  when (io.enq.fire()) {
+  when (io.enq.fire) {
     regs(waddr) := io.enq.bits
     waddr := wrappingAdd(waddr, 1.U, entries)
   }
 
-  when (io.deq.fire()) {
+  when (io.deq.fire) {
     regs(raddr).make_this_garbage()
     raddr := wrappingAdd(raddr, 1.U, entries)
   }
 
-  when (io.enq.fire() && !io.deq.fire()) {
+  when (io.enq.fire && !io.deq.fire) {
     len := len + 1.U
-  }.elsewhen(!io.enq.fire() && io.deq.fire()) {
+  }.elsewhen(!io.enq.fire && io.deq.fire) {
     len := len - 1.U
   }
 
diff --git a/src/main/scala/gemmini/TilerController.scala b/src/main/scala/gemmini/TilerController.scala
index 87ebff98..f3275790 100644
--- a/src/main/scala/gemmini/TilerController.scala
+++ b/src/main/scala/gemmini/TilerController.scala
@@ -26,8 +26,6 @@ class TilerCmd(OTYPE_BITS_IDX: Int)
   val repeating_bias = Bool()
   val status         = new MStatus
 
-  override def cloneType: this.type =
-    (new TilerCmd(OTYPE_BITS_IDX)).asInstanceOf[this.type]
 }
 
 
diff --git a/src/main/scala/gemmini/TilerFSM.scala b/src/main/scala/gemmini/TilerFSM.scala
index b60a5991..db400f96 100644
--- a/src/main/scala/gemmini/TilerFSM.scala
+++ b/src/main/scala/gemmini/TilerFSM.scala
@@ -107,7 +107,7 @@ class TilerFSM[T <: Data : Arithmetic, U <: Data, V <: Data]
   // combinational calculation of optimal output-groups. this is updated at
   // the s_IDLE -> s_RESET_OUTPUT_GROUP state transition
   //------------------------------------------------------------------------
-  val g_OG_DIM_SELECT = OG_HEIGHT_MAP.zipWithIndex.map{ case(h,i) => 
+  val g_OG_DIM_SELECT = OG_HEIGHT_MAP.zipWithIndex.map{ case(h,i) =>
     val w = TOTAL_ACC_TILES/h
     if (h < w)      WireDefault(g_TILE_ROW_END < h.U)
     else if(h > w)  WireDefault(g_TILE_COL_END < w.U)
@@ -198,9 +198,9 @@ class TilerFSM[T <: Data : Arithmetic, U <: Data, V <: Data]
 
   // continuous assigns (only added in the switch-cases that call this!)
   def update_tile_dims(dummy: Int = 0) = {
-    gbl_item_rows     := Mux(gbl_tile_row_n === g_TILE_ROW_END, 
+    gbl_item_rows     := Mux(gbl_tile_row_n === g_TILE_ROW_END,
                              g_LAST_M_ITEMS, DIM.U)
-    gbl_item_cols     := Mux(gbl_tile_col_n === g_TILE_COL_END, 
+    gbl_item_cols     := Mux(gbl_tile_col_n === g_TILE_COL_END,
                              g_LAST_N_ITEMS, DIM.U)
     loop2_k_item_dims := Mux(loop2_k_tile_col_n === g_K_TILE_COL_END,
                              g_LAST_K_ITEMS, DIM.U)
@@ -246,7 +246,7 @@ class TilerFSM[T <: Data : Arithmetic, U <: Data, V <: Data]
       g_LAST_N_ITEMS := Mux(cmd.n(LOG2_DIM-1,0).orR,cmd.n(LOG2_DIM-1,0),DIM.U)
       g_LAST_K_ITEMS := Mux(cmd.k(LOG2_DIM-1,0).orR,cmd.k(LOG2_DIM-1,0),DIM.U)
 
-      g_TILE_ROW_END   := (cmd.m >> LOG2_DIM) + cmd.m(LOG2_DIM-1,0).orR - 1.U 
+      g_TILE_ROW_END   := (cmd.m >> LOG2_DIM) + cmd.m(LOG2_DIM-1,0).orR - 1.U
       g_TILE_COL_END   := (cmd.n >> LOG2_DIM) + cmd.n(LOG2_DIM-1,0).orR - 1.U
       g_K_TILE_COL_END := (cmd.k >> LOG2_DIM) + cmd.k(LOG2_DIM-1,0).orR - 1.U
 
@@ -256,7 +256,7 @@ class TilerFSM[T <: Data : Arithmetic, U <: Data, V <: Data]
 
       // issue gemmini commands
       // NOTE: the "h10000".U(17) is because a_addr_stride was added to ExecuteController
-      when(io.cmd_in.fire()) {
+      when(io.cmd_in.fire) {
         sched.push               := 2.U
         sched.bits(0).inst.funct := CONFIG_CMD
         sched.bits(0).rs1        := (g_ACC_OUT_RSHIFT << 32) |
@@ -639,7 +639,7 @@ class TilerFSM[T <: Data : Arithmetic, U <: Data, V <: Data]
       val l_did_row_incr = WireDefault(false.B)
       val l_did_col_incr = WireDefault(false.B)
 
-      when (gbl_tile_col === g_TILE_COL_END && 
+      when (gbl_tile_col === g_TILE_COL_END &&
             gbl_tile_row === g_TILE_ROW_END) {
         // update next state
         state := s_IDLE
@@ -658,7 +658,7 @@ class TilerFSM[T <: Data : Arithmetic, U <: Data, V <: Data]
           update_tile_dims()
           l_did_col_incr := true.B
         }
-   
+
         // reset global state that resets for each new output-group
         gbl_CD_acc_row_addr := 0.U
 
@@ -672,11 +672,11 @@ class TilerFSM[T <: Data : Arithmetic, U <: Data, V <: Data]
 
         loop1_tile_col_start := l_tile_col_start
         loop1_tile_col_end   := l_tile_col_end
-                                
+
         loop1_tile_row_start := l_tile_row_start
         loop1_tile_row_end   := l_tile_row_end
-                                
-         
+
+
         // update all derived pointers to matrices in memory
         when(l_did_row_incr) {
           loop1_A_mem_addr := g_A_MEM_ADDR + (l_tile_row_start *
@@ -693,7 +693,7 @@ class TilerFSM[T <: Data : Arithmetic, U <: Data, V <: Data]
           loop1_A_mem_addr := loop1_A_mem_addr + 0.U
           loop1_B_mem_addr := loop1_B_mem_addr + g_I_BYTE_COLS_PER_GROUP
           loop1_C_mem_addr := loop1_C_mem_addr + g_I_BYTE_COLS_PER_GROUP
-          loop1_D_mem_addr := loop1_D_mem_addr + 
+          loop1_D_mem_addr := loop1_D_mem_addr +
                               Mux(!g_HAS_BIAS, 0.U, g_O_BYTE_COLS_PER_GROUP)
         }
 
diff --git a/src/main/scala/gemmini/TilerScheduler.scala b/src/main/scala/gemmini/TilerScheduler.scala
index a1225ed7..c09ff949 100644
--- a/src/main/scala/gemmini/TilerScheduler.scala
+++ b/src/main/scala/gemmini/TilerScheduler.scala
@@ -48,9 +48,9 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
     val is_acc = Bool()
     val start  = UInt(30.W) // TODO magic number
     val end    = UInt(30.W) // TODO magic number
-    def overlaps(other: SPRange) = valid && other.valid && 
+    def overlaps(other: SPRange) = valid && other.valid &&
                                    (is_acc === other.is_acc) &&
-                                   (start < other.end) && 
+                                   (start < other.end) &&
                                    (end > other.start)
   }
 
@@ -93,14 +93,14 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
 
   val new_entry = Wire(new Entry)
   new_entry := DontCare
-  val new_entry_id = MuxCase((ROB_ENTRIES-1).U, entries.zipWithIndex.map { 
+  val new_entry_id = MuxCase((ROB_ENTRIES-1).U, entries.zipWithIndex.map {
                                         case (e, i) => !e.valid -> i.U })
-  val alloc_fire = io.cmd_in.fire()
+  val alloc_fire = io.cmd_in.fire
 
-  when (io.cmd_in.fire()) {
+  when (io.cmd_in.fire) {
     val cmd = io.cmd_in.bits
     val funct = cmd.inst.funct
-    val funct_is_compute = funct === COMPUTE_AND_STAY_CMD || 
+    val funct_is_compute = funct === COMPUTE_AND_STAY_CMD ||
                            funct === COMPUTE_AND_FLIP_CMD
     val funct_is_compute_preload = funct === COMPUTE_AND_FLIP_CMD
     val config_cmd_type = cmd.rs1(1,0) // TODO magic numbers
@@ -121,22 +121,22 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
     new_entry.op2.valid := funct_is_compute || funct === STORE_CMD
     new_entry.op2.is_acc := cmd.rs2(31)
     new_entry.op2.start := cmd.rs2(29,0)
-    new_entry.op2.end   := cmd.rs2(29,0) + 
+    new_entry.op2.end   := cmd.rs2(29,0) +
                            Mux(funct_is_compute, DIM.U, mvin_mvout_rows)
 
     new_entry.dst.valid := funct === PRELOAD_CMD || funct === LOAD_CMD
     new_entry.dst.is_acc := cmd.rs2(31)
     new_entry.dst.start := cmd.rs2(29,0)
-    new_entry.dst.end   := cmd.rs2(29,0) + 
-                           Mux(funct === PRELOAD_CMD, DIM.U, 
+    new_entry.dst.end   := cmd.rs2(29,0) +
+                           Mux(funct === PRELOAD_CMD, DIM.U,
                             mvin_mvout_rows)
 
-    val is_load    = (funct === LOAD_CMD) || 
+    val is_load    = (funct === LOAD_CMD) ||
                      (funct === CONFIG_CMD && config_cmd_type === CONFIG_LOAD)
-    val is_store   = (funct === STORE_CMD) || 
+    val is_store   = (funct === STORE_CMD) ||
                      (funct === CONFIG_CMD && config_cmd_type === CONFIG_STORE)
     val is_exec    = funct === PRELOAD_CMD ||
-                     funct_is_compute || 
+                     funct_is_compute ||
                      (funct === CONFIG_CMD && config_cmd_type === CONFIG_EX)
     val is_preload = funct === PRELOAD_CMD
 
@@ -153,22 +153,22 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
     when(new_entry.is_config) {
       when (new_entry.is_load) {
         printf(
-          "cycle[%d], entry[%d], accept[%d], config_mvin[stride=%x]\n", 
-          debug_cycle, new_entry_id, cmd_id.value, 
+          "cycle[%d], entry[%d], accept[%d], config_mvin[stride=%x]\n",
+          debug_cycle, new_entry_id, cmd_id.value,
           new_entry.cmd.rs2)
       }
       .elsewhen (new_entry.is_store) {
         printf(
-          "cycle[%d], entry[%d], accept[%d], config_mvout[stride=%x]\n", 
-          debug_cycle, new_entry_id, cmd_id.value, 
+          "cycle[%d], entry[%d], accept[%d], config_mvout[stride=%x]\n",
+          debug_cycle, new_entry_id, cmd_id.value,
           new_entry.cmd.rs2)
       }
       .otherwise {
         assert(new_entry.is_exec)
         printf(
           "cycle[%d], entry[%d], accept[%d], " +
-          "config_ex[matmul_rshift=%x, acc_rshift=%x, relu6_lshift=%x]\n", 
-          debug_cycle, new_entry_id, cmd_id.value, 
+          "config_ex[matmul_rshift=%x, acc_rshift=%x, relu6_lshift=%x]\n",
+          debug_cycle, new_entry_id, cmd_id.value,
           cmd.rs1(63,32), cmd.rs2(31,0), cmd.rs2(63,32))
       }
     }
@@ -176,20 +176,20 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
       printf(
         "cycle[%d], entry[%d], accept[%d], " +
         "mvin[dram=%x, spad=%x, rows=%x, cols=%x]\n",
-        debug_cycle, new_entry_id, cmd_id.value, 
+        debug_cycle, new_entry_id, cmd_id.value,
         cmd.rs1, cmd.rs2(31,0), cmd.rs2(63,48), cmd.rs2(47,32))
     }
     .elsewhen (new_entry.is_store) {
       printf(
-        "cycle[%d], entry[%d], accept[%d], " + 
+        "cycle[%d], entry[%d], accept[%d], " +
         "mvout[dram=%x, spad=%x, rows=%x, cols=%x]\n",
-        debug_cycle, new_entry_id, cmd_id.value, 
+        debug_cycle, new_entry_id, cmd_id.value,
         cmd.rs1, cmd.rs2(31,0), cmd.rs2(63,48), cmd.rs2(47,32))
     }
     .elsewhen (new_entry.is_preload) {
       printf(
         "cycle[%d], entry[%d], accept[%d], preload[B=%x, C=%x]\n",
-        debug_cycle, new_entry_id, cmd_id.value, 
+        debug_cycle, new_entry_id, cmd_id.value,
         cmd.rs1(31,0), cmd.rs2(31,0))
     }
     .otherwise {
@@ -197,13 +197,13 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
       when (funct_is_compute_preload) {
         printf(
           "cycle[%d], entry[%d], accept[%d], ex.pre[A=%x, D=%x]\n",
-          debug_cycle, new_entry_id, cmd_id.value, 
+          debug_cycle, new_entry_id, cmd_id.value,
           cmd.rs1(31,0), cmd.rs2(31,0))
       }
       .otherwise {
         printf(
           "cycle[%d], entry[%d], accept[%d], ex.acc[A=%x, D=%x]\n",
-          debug_cycle, new_entry_id, cmd_id.value, 
+          debug_cycle, new_entry_id, cmd_id.value,
           cmd.rs1(31,0), cmd.rs2(31,0))
       }
     }
@@ -228,12 +228,12 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
     )}
 
     // We search for all entries which write to an address that we write to
-    val waws = entries.map { e => e.valid && 
+    val waws = entries.map { e => e.valid &&
       new_entry.dst.overlaps(e.bits.dst)
     }
 
-    val older_in_same_q = entries.map { e => e.valid && 
-      e.bits.q === new_entry.q && 
+    val older_in_same_q = entries.map { e => e.valid &&
+      e.bits.q === new_entry.q &&
       !e.bits.issued
     }
 
@@ -247,11 +247,11 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
       (new_entry.q === exq && new_entry.is_config)
     }
 
-    new_entry.deps := (Cat(raws) | 
-                       Cat(wars) | 
-                       Cat(waws) | 
+    new_entry.deps := (Cat(raws) |
+                       Cat(wars) |
+                       Cat(waws) |
                        Cat(older_in_same_q) |
-                       Cat(is_st_and_must_wait_for_prior_ex_config) | 
+                       Cat(is_st_and_must_wait_for_prior_ex_config) |
                        Cat(is_ex_config_and_must_wait_for_prior_st)
                       ).asBools().reverse
 
@@ -264,20 +264,20 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
   }
 
   // Issue commands which are ready to be issued
-  Seq((ldq, io.issue.load), 
-      (stq, io.issue.store), 
+  Seq((ldq, io.issue.load),
+      (stq, io.issue.store),
       (exq, io.issue.exec)).foreach { case (q, io) =>
-    val issue_id = MuxCase((ROB_ENTRIES-1).U, entries.zipWithIndex.map { 
-      case (e, i) => (e.valid && e.bits.ready() && 
+    val issue_id = MuxCase((ROB_ENTRIES-1).U, entries.zipWithIndex.map {
+      case (e, i) => (e.valid && e.bits.ready() &&
                       !e.bits.issued && e.bits.q === q) -> i.U
     })
-    io.valid := entries.map(e => e.valid && e.bits.ready() && !e.bits.issued 
+    io.valid := entries.map(e => e.valid && e.bits.ready() && !e.bits.issued
                                  && e.bits.q === q).reduce(_ || _)
     io.bits.cmd := entries(issue_id).bits.cmd
     io.bits.rob_id.push(issue_id)
 
     // ssteff: added for debug
-    when(io.fire()) {
+    when(io.fire) {
       //======================================================================
       // debug
       //======================================================================
@@ -287,7 +287,7 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
             "cycle[%d], entry[%d],  issue[%d], config_mvin\n",
             debug_cycle, issue_id, entries(issue_id).bits.cmd_id)
           printf(
-            "cycle[%d], entry[%d],  final[%d], config_mvin\n", 
+            "cycle[%d], entry[%d],  final[%d], config_mvin\n",
             debug_cycle, issue_id, entries(issue_id).bits.cmd_id)
         }
         .elsewhen (entries(issue_id).bits.is_store) {
@@ -295,7 +295,7 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
             "cycle[%d], entry[%d],  issue[%d], config_mvout\n",
             debug_cycle, issue_id, entries(issue_id).bits.cmd_id)
           printf(
-            "cycle[%d], entry[%d],  final[%d], config_mvout\n", 
+            "cycle[%d], entry[%d],  final[%d], config_mvout\n",
             debug_cycle, issue_id, entries(issue_id).bits.cmd_id)
         }
         .otherwise {
@@ -330,7 +330,7 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
 
       entries(issue_id).bits.issued := true.B
 
-      // Clear out all the dependency bits for instructions which 
+      // Clear out all the dependency bits for instructions which
       // depend on the same queue
       entries.zipWithIndex.foreach { case (e, i) =>
         val is_same_q = Mux(alloc_fire && new_entry_id === i.U,
@@ -347,7 +347,7 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
   }
 
   // Mark entries as completed once they've returned
-  when (io.completed.fire()) {
+  when (io.completed.fire) {
     //======================================================================
     // debug
     //======================================================================
@@ -356,32 +356,32 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
       assert(entries(io.completed.bits).bits.is_exec)
       printf(
         "cycle[%d], entry[%d],  final[%d], config_ex\n",
-        debug_cycle, io.completed.bits, 
+        debug_cycle, io.completed.bits,
         entries(io.completed.bits).bits.cmd_id)
     }
     .elsewhen (entries(io.completed.bits).bits.is_load) {
       printf(
         "cycle[%d], entry[%d],  final[%d], mvin\n",
-        debug_cycle, io.completed.bits, 
+        debug_cycle, io.completed.bits,
         entries(io.completed.bits).bits.cmd_id)
     }
     .elsewhen (entries(io.completed.bits).bits.is_store) {
       printf(
         "cycle[%d], entry[%d],  final[%d], mvout\n",
-        debug_cycle, io.completed.bits, 
+        debug_cycle, io.completed.bits,
         entries(io.completed.bits).bits.cmd_id)
     }
     .elsewhen (entries(io.completed.bits).bits.is_preload) {
       printf(
         "cycle[%d], entry[%d],  final[%d], preload\n",
-        debug_cycle, io.completed.bits, 
+        debug_cycle, io.completed.bits,
         entries(io.completed.bits).bits.cmd_id)
     }
     .otherwise {
       assert(entries(io.completed.bits).bits.is_exec)
       printf(
         "cycle[%d], entry[%d],  final[%d], ex\n",
-        debug_cycle, io.completed.bits, 
+        debug_cycle, io.completed.bits,
         entries(io.completed.bits).bits.cmd_id)
     }
     //======================================================================
@@ -393,14 +393,14 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
   }
 
   val util = PopCount(entries.map(e => e.valid))
-  val util_ld_q_unissued = PopCount(entries.map(e => e.valid && 
-                                                     !e.bits.issued && 
+  val util_ld_q_unissued = PopCount(entries.map(e => e.valid &&
+                                                     !e.bits.issued &&
                                                      e.bits.q === ldq))
-  val util_st_q_unissued = PopCount(entries.map(e => e.valid && 
-                                                     !e.bits.issued && 
+  val util_st_q_unissued = PopCount(entries.map(e => e.valid &&
+                                                     !e.bits.issued &&
                                                      e.bits.q === stq))
-  val util_ex_q_unissued = PopCount(entries.map(e => e.valid && 
-                                                     !e.bits.issued && 
+  val util_ex_q_unissued = PopCount(entries.map(e => e.valid &&
+                                                     !e.bits.issued &&
                                                      e.bits.q === exq))
   val util_ld_q = PopCount(entries.map(e => e.valid && e.bits.q === ldq))
   val util_st_q = PopCount(entries.map(e => e.valid && e.bits.q === stq))
@@ -417,9 +417,9 @@ class TilerScheduler[T <: Data: Arithmetic, U <: Data, V <: Data]
 
   val cycles_since_issue = RegInit(0.U(32.W))
 
-  when (io.issue.load.fire() || 
-        io.issue.store.fire() || 
-        io.issue.exec.fire() || 
+  when (io.issue.load.fire ||
+        io.issue.store.fire ||
+        io.issue.exec.fire ||
         !io.busy) {
     cycles_since_issue := 0.U
   } .elsewhen (io.busy) {
diff --git a/src/main/scala/gemmini/TransposePreloadUnroller.scala b/src/main/scala/gemmini/TransposePreloadUnroller.scala
index 0bac0e5b..fb3ef127 100644
--- a/src/main/scala/gemmini/TransposePreloadUnroller.scala
+++ b/src/main/scala/gemmini/TransposePreloadUnroller.scala
@@ -65,9 +65,9 @@ class TransposePreloadUnroller[T <: Data, U <: Data, V <: Data](config: GemminiA
     (state === second_preload) -> second_preload_cmd,
   ))
 
-  q.pop := Mux(io.out.fire() && !(first_preload && unroll_preload) && state =/= first_compute, 1.U, 0.U)
+  q.pop := Mux(io.out.fire && !(first_preload && unroll_preload) && state =/= first_compute, 1.U, 0.U)
 
-  when (io.out.fire()) {
+  when (io.out.fire) {
     when (is_config) {
       val set_only_strides = cmds(0).cmd.rs1(7)
       when (!set_only_strides) {
diff --git a/src/main/scala/gemmini/Transposer.scala b/src/main/scala/gemmini/Transposer.scala
index 1abbd840..23fc5365 100644
--- a/src/main/scala/gemmini/Transposer.scala
+++ b/src/main/scala/gemmini/Transposer.scala
@@ -19,8 +19,8 @@ class PipelinedTransposer[T <: Data](val dim: Int, val dataType: T) extends Tran
   val regArrayT = regArray.transpose
   val sMoveUp :: sMoveLeft :: Nil = Enum(2)
   val state = RegInit(sMoveUp)
-  val leftCounter = RegInit(0.U(log2Ceil(dim+1).W)) //(io.inRow.fire() && state === sMoveLeft, dim+1)
-  val upCounter = RegInit(0.U(log2Ceil(dim+1).W)) //Counter(io.inRow.fire() && state === sMoveUp, dim+1)
+  val leftCounter = RegInit(0.U(log2Ceil(dim+1).W)) //(io.inRow.fire && state === sMoveLeft, dim+1)
+  val upCounter = RegInit(0.U(log2Ceil(dim+1).W)) //Counter(io.inRow.fire && state === sMoveUp, dim+1)
 
   io.outCol.valid := 0.U
   io.inRow.ready := 0.U
@@ -28,14 +28,14 @@ class PipelinedTransposer[T <: Data](val dim: Int, val dataType: T) extends Tran
     is(sMoveUp) {
       io.inRow.ready := upCounter <= dim.U
       io.outCol.valid := leftCounter > 0.U
-      when(io.inRow.fire()) {
+      when(io.inRow.fire) {
         upCounter := upCounter + 1.U
       }
       when(upCounter === (dim-1).U) {
         state := sMoveLeft
         leftCounter := 0.U
       }
-      when(io.outCol.fire()) {
+      when(io.outCol.fire) {
         leftCounter := leftCounter - 1.U
       }
     }
@@ -45,11 +45,11 @@ class PipelinedTransposer[T <: Data](val dim: Int, val dataType: T) extends Tran
       when(leftCounter === (dim-1).U) {
         state := sMoveUp
       }
-      when(io.inRow.fire()) {
+      when(io.inRow.fire) {
         leftCounter := leftCounter + 1.U
         upCounter := 0.U
       }
-      when(io.outCol.fire()) {
+      when(io.outCol.fire) {
         upCounter := upCounter - 1.U
       }
     }
@@ -131,7 +131,7 @@ class AlwaysOutTransposer[T <: Data](val dim: Int, val dataType: T) extends Tran
 
   // Wire up global signals
   pes.flatten.foreach(_.io.dir := dir)
-  pes.flatten.foreach(_.io.en := io.inRow.fire())
+  pes.flatten.foreach(_.io.en := io.inRow.fire)
 
   io.outCol.valid := true.B
   io.inRow.ready := true.B
@@ -141,11 +141,11 @@ class AlwaysOutTransposer[T <: Data](val dim: Int, val dataType: T) extends Tran
 
   io.outCol.bits := Mux(dir === LEFT_DIR, left_out, up_out)
 
-  when (io.inRow.fire()) {
+  when (io.inRow.fire) {
     counter := wrappingAdd(counter, 1.U, dim)
   }
 
-  when (counter === (dim-1).U && io.inRow.fire()) {
+  when (counter === (dim-1).U && io.inRow.fire) {
     dir := ~dir
   }
 }
@@ -155,7 +155,7 @@ class NaiveTransposer[T <: Data](val dim: Int, val dataType: T) extends Transpos
   val regArrayT = regArray.transpose
   // state = 0 => filling regArray row-wise, state = 1 => draining regArray column-wise
   val state = RegInit(0.U(1.W))
-  val countInc = io.inRow.fire() || io.outCol.fire()
+  val countInc = io.inRow.fire || io.outCol.fire
   val (countValue, countWrap) = Counter(countInc, dim)
 
   io.inRow.ready := state === 0.U
@@ -163,7 +163,7 @@ class NaiveTransposer[T <: Data](val dim: Int, val dataType: T) extends Transpos
 
   for (i <- 0 until dim) {
     for (j <- 0 until dim) {
-      when(countValue === i.U && io.inRow.fire()) {
+      when(countValue === i.U && io.inRow.fire) {
         regArray(i)(j) := io.inRow.bits(j)
       }
     }
@@ -178,13 +178,13 @@ class NaiveTransposer[T <: Data](val dim: Int, val dataType: T) extends Transpos
     }
   }
 
-  when (io.inRow.fire() && countWrap) {
+  when (io.inRow.fire && countWrap) {
     state := 1.U
   }
-  when (io.outCol.fire() && countWrap) {
+  when (io.outCol.fire && countWrap) {
     state := 0.U
   }
 
-  assert(!(state === 0.U) || !io.outCol.fire())
-  assert(!(state === 1.U) || !io.inRow.fire())
+  assert(!(state === 0.U) || !io.outCol.fire)
+  assert(!(state === 1.U) || !io.inRow.fire)
 }
diff --git a/src/main/scala/gemmini/Util.scala b/src/main/scala/gemmini/Util.scala
index 907c4ad2..51dc1377 100644
--- a/src/main/scala/gemmini/Util.scala
+++ b/src/main/scala/gemmini/Util.scala
@@ -140,7 +140,6 @@ object Util {
       bits
     }
 
-    override def cloneType: this.type = new UDValid(t.cloneType).asInstanceOf[this.type]
   }
 
   object UDValid {
diff --git a/src/main/scala/gemmini/VectorScalarMultiplier.scala b/src/main/scala/gemmini/VectorScalarMultiplier.scala
index 271bf290..2311b381 100644
--- a/src/main/scala/gemmini/VectorScalarMultiplier.scala
+++ b/src/main/scala/gemmini/VectorScalarMultiplier.scala
@@ -13,7 +13,6 @@ class VectorScalarMultiplierReq[T <: Data, U <: Data, Tag <: Data](block_cols: I
   val last: Bool = Bool()
   val tag: Tag = tag_t.cloneType
 
-  override def cloneType: VectorScalarMultiplierReq.this.type = new VectorScalarMultiplierReq(block_cols, t, u, tag_t).asInstanceOf[this.type]
 }
 
 class VectorScalarMultiplierResp[T <: Data, Tag <: Data](block_cols: Int, t: T, tag_t: Tag) extends Bundle {
@@ -22,7 +21,6 @@ class VectorScalarMultiplierResp[T <: Data, Tag <: Data](block_cols: Int, t: T,
   val last: Bool = Bool()
   val tag: Tag = tag_t.cloneType
 
-  override def cloneType: VectorScalarMultiplierResp.this.type = new VectorScalarMultiplierResp(block_cols, t, tag_t).asInstanceOf[this.type]
 }
 
 class DataWithIndex[T <: Data, U <: Data](t: T, u: U) extends Bundle {
@@ -30,7 +28,6 @@ class DataWithIndex[T <: Data, U <: Data](t: T, u: U) extends Bundle {
   val scale = u.cloneType
   val id = UInt(2.W) // TODO hardcoded
   val index = UInt()
-  override def cloneType: DataWithIndex.this.type = new DataWithIndex(t, u).asInstanceOf[this.type]
 }
 
 class ScalePipe[T <: Data, U <: Data](t: T, mvin_scale_args: ScaleArguments[T, U]) extends Module {
@@ -69,7 +66,7 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data](
   val in_fire = WireInit(false.B)
   io.req.ready := !in.valid || (in.bits.repeats === 0.U && in_fire)
 
-  when (io.req.fire()) {
+  when (io.req.fire) {
     in.valid := io.req.valid
     in.bits  := io.req.bits
   } .elsewhen (in_fire) {
@@ -88,7 +85,7 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data](
       latency
     )())
     io.resp <> pipe.io.out
-    in_fire := pipe.io.in.fire()
+    in_fire := pipe.io.in.fire
 
     pipe.io.in.valid := in.valid
     pipe.io.in.bits.tag := in.bits.tag
@@ -111,7 +108,7 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data](
 
     io.resp.valid := Mux1H(head_oh.asBools, (regs zip completed_masks).map({case (r,c) => r.valid && c.reduce(_&&_)}))
     io.resp.bits := Mux1H(head_oh.asBools, out_regs)
-    when (io.resp.fire()) {
+    when (io.resp.fire) {
       for (i <- 0 until nEntries) {
         when (head_oh(i)) {
           regs(i).valid := false.B
@@ -153,7 +150,7 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data](
         input.bits.scale  := regs(i).bits.scale.asTypeOf(u)
         input.bits.id := i.U
         input.bits.index := w.U
-        when (input.fire()) {
+        when (input.fire) {
           fired_masks(i)(w) := true.B
         }
       }
@@ -176,7 +173,7 @@ class VectorScalarMultiplier[T <: Data, U <: Data, Tag <: Data](
       for (j <- 0 until nEntries) {
         for (w <- 0 until width) {
           if ((j*width+w) % num_scale_units == i) {
-            when (pipe_out.fire() && pipe_out.bits.id === j.U && pipe_out.bits.index === w.U) {
+            when (pipe_out.fire && pipe_out.bits.id === j.U && pipe_out.bits.index === w.U) {
               out_regs(j).out(w) := pipe_out.bits.data
               completed_masks(j)(w) := true.B
             }
diff --git a/src/main/scala/gemmini/WeightedArbiter.scala b/src/main/scala/gemmini/WeightedArbiter.scala
index a27decde..90a37273 100644
--- a/src/main/scala/gemmini/WeightedArbiter.scala
+++ b/src/main/scala/gemmini/WeightedArbiter.scala
@@ -61,7 +61,7 @@ class WeightedArbiter[T <: Data](t: T, maxWeightA: Int, staticWeightAEnabled: Bo
     }
   }
 
-  when (io.out.fire()) {
+  when (io.out.fire) {
     when (A_chosen) {
       count := satAdd(count, 1.U, weightA + 1.U)
     }.elsewhen(B_chosen) {
diff --git a/src/main/scala/gemmini/XactTracker.scala b/src/main/scala/gemmini/XactTracker.scala
index 84821d4e..277626a1 100644
--- a/src/main/scala/gemmini/XactTracker.scala
+++ b/src/main/scala/gemmini/XactTracker.scala
@@ -23,7 +23,6 @@ class XactTrackerEntry[U <: Data](maxShift: Int, spadWidth: Int, accWidth: Int,
   val bytes_to_read = UInt(log2Up(maxReqBytes+1).W)
   val cmd_id = UInt(log2Up(nCmds).W)
 
-  override def cloneType: XactTrackerEntry.this.type = new XactTrackerEntry(maxShift, spadWidth, accWidth, spadRows, accRows, maxReqBytes, mvin_scale_t_bits, nCmds).asInstanceOf[this.type]
 }
 
 class XactTrackerAllocIO[U <: Data](nXacts: Int, maxShift: Int, spadWidth: Int, accWidth :Int,
@@ -36,7 +35,6 @@ class XactTrackerAllocIO[U <: Data](nXacts: Int, maxShift: Int, spadWidth: Int,
 
   def fire(dummy: Int = 0) = valid && ready
 
-  override def cloneType: XactTrackerAllocIO.this.type = new XactTrackerAllocIO(nXacts, maxShift, spadWidth, accWidth, spadRows, accRows, maxReqBytes, mvin_scale_t_bits, nCmds).asInstanceOf[this.type]
 }
 
 class XactTrackerPeekIO[U <: Data](val nXacts: Int, val maxShift: Int, val spadWidth: Int, val accWidth: Int,
diff --git a/src/main/scala/gemmini/ZeroWriter.scala b/src/main/scala/gemmini/ZeroWriter.scala
index 5a30aad5..a5c10abe 100644
--- a/src/main/scala/gemmini/ZeroWriter.scala
+++ b/src/main/scala/gemmini/ZeroWriter.scala
@@ -11,7 +11,6 @@ class ZeroWriterReq[Tag <: Data](laddr_t: LocalAddr, max_cols: Int, tag_t: Tag)
   val block_stride = UInt(16.W) // TODO magic number
   val tag = tag_t
 
-  override def cloneType: ZeroWriterReq.this.type = new ZeroWriterReq(laddr_t.cloneType, max_cols, tag_t.cloneType).asInstanceOf[this.type]
 }
 
 class ZeroWriterResp[Tag <: Data](laddr_t: LocalAddr, block_cols: Int, tag_t: Tag) extends Bundle {
@@ -20,7 +19,6 @@ class ZeroWriterResp[Tag <: Data](laddr_t: LocalAddr, block_cols: Int, tag_t: Ta
   val last = Bool()
   val tag = tag_t
 
-  override def cloneType: ZeroWriterResp.this.type = new ZeroWriterResp(laddr_t, block_cols, tag_t.cloneType).asInstanceOf[this.type]
 }
 
 class ZeroWriter[T <: Data, U <: Data, V <: Data, Tag <: Data](config: GemminiArrayConfig[T, U, V], tag_t: Tag)
@@ -47,7 +45,7 @@ class ZeroWriter[T <: Data, U <: Data, V <: Data, Tag <: Data](config: GemminiAr
   io.resp.bits.last := col_counter +& block_cols.U >= req.bits.cols
   io.resp.bits.tag := req.bits.tag
 
-  when (io.resp.fire()) {
+  when (io.resp.fire) {
     val next_col_counter = floorAdd(col_counter, block_cols.U, req.bits.cols)
 
     col_counter := next_col_counter
@@ -58,7 +56,7 @@ class ZeroWriter[T <: Data, U <: Data, V <: Data, Tag <: Data](config: GemminiAr
     }
   }
 
-  when (io.req.fire()) {
+  when (io.req.fire) {
     req.push(io.req.bits)
 
     col_counter := 0.U