diff --git a/soc/hps_lattice_nx.py b/soc/hps_lattice_nx.py
index a28af6b96..18758ade8 100644
--- a/soc/hps_lattice_nx.py
+++ b/soc/hps_lattice_nx.py
@@ -45,11 +45,11 @@
 
 from migen import *
 from litex.soc.interconnect import wishbone
+from litex.soc.cores.clock.common import *
 
 kB = 1024
 
 
-
 def initval_parameters(contents, width):
     """
     In Radiant, initial values for LRAM are passed a sequence of parameters
@@ -203,3 +203,431 @@ def add_init(self, data):
                 offset = d * self.width_cascading * 64*kB + w * 64*kB
                 chunk = data[offset:offset + 64*kB]
                 self.lram_blocks[d][w].items += initval_parameters(chunk, self.width)
+
+
+from collections import namedtuple
+import logging
+import pprint
+from math import log10, pi
+from cmath import phase
+
+logging.basicConfig(level=logging.INFO)  # NOTE(review): configures the root logger as an import side effect - confirm this is intended
+# io_i2: `io` is the computed ip value and `i2` the IPI_CMP value (see is_valid_io_i2), plus the settings that produced io.
+io_i2 = namedtuple('io_i2',['io', 'i2', 'IPP_CTRL', 'BW_CTL_BIAS', 'IPP_SEL'])
+nx_pll_param_permutation = namedtuple("nx_pll_param_permutation",[  # cached transfer-function coefficients plus the settings they were computed from
+                                "C1","C2","C3","C4","C5","C6",
+                                "IPP_CTRL","BW_CTL_BIAS","IPP_SEL","CSET","CRIPPLE","V2I_PP_RES","IPI_CMP"])
+
+
+# Lattice / NX PLL ---------------------------------------------------------------------------------
+
+class NXPLL(Module):
+    nclkouts_max        = 5  # create_clkout asserts that fewer than this many outputs are already registered
+    clki_div_range      = ( 1, 128+1)  # divider ranges are expanded with range(*...), so the upper bound is exclusive
+    clkfb_div_range     = ( 1, 128+1)
+    clko_div_range      = ( 1, 128+1)
+    clki_freq_range     = ( 10e6,   500e6)  # (min, max) accepted by register_clkin, both inclusive
+    clko_freq_range     = ( 6.25e6, 800e6)  # (min, max) accepted by create_clkout, both inclusive
+    vco_in_freq_range   = ( 10e6,   500e6)
+    vco_out_freq_range  = ( 800e6,  1600e6)  # (min, max) VCO frequency accepted by compute_config, both inclusive
+    instance_num        = 0  # class-wide counter used to build the "PLL_<n>" instance names
+
+    def __init__(self, platform = None, create_output_port_clocks=False):  # platform is only used in do_finalize to emit create_clock commands
+        self.logger = logging.getLogger("NXPLL")
+        self.logger.info("Creating NXPLL.")
+        self.params     = {}  # Instance() parameters and ports, accumulated until do_finalize
+        self.reset      = Signal()
+        self.locked     = Signal()
+        self.params["o_LOCK"] = self.locked
+        self.clkin_freq = None  # set by register_clkin
+        self.vcxo_freq  = None
+        self.nclkouts   = 0
+        self.clkouts    = {}  # output index -> (clk signal, freq, phase, margin), filled by create_clkout
+        self.config     = {}
+        self.name       = 'PLL_' + str(NXPLL.instance_num)
+        NXPLL.instance_num += 1
+        self.platform   = platform
+        self.create_output_port_clocks = create_output_port_clocks
+        self.clk_names  = []  # clock-domain names, in registration order
+        self.enable     = []  # replaced in do_finalize by a Record with one field per output clock
+        # Precompute the analog-parameter search space once per instance.
+        self.calc_valid_io_i2()
+        self.calc_tf_coefficients()
+
+    def register_clkin(self, clkin, freq):
+        """Connect `clkin` as the PLL reference clock and record its frequency `freq`; raises ValueError for other clkin types."""
+        (clki_freq_min, clki_freq_max) = self.clki_freq_range
+        assert clki_freq_min <= freq <= clki_freq_max
+        self.clkin = Signal()
+        if isinstance(clkin, (Signal, ClockSignal)):
+            self.comb += self.clkin.eq(clkin)
+        else:
+            raise ValueError("clkin must be a Signal or ClockSignal, got {!r}".format(type(clkin)))
+        self.clkin_freq = freq
+        register_clkin_log(self.logger, clkin, freq)
+
+    def create_clkout(self, cd, freq, phase=0, margin=1e-2):
+        """Register an output clock for domain `cd` at `freq`, matched later within relative tolerance `margin`."""
+        lowest, highest = self.clko_freq_range
+        assert lowest <= freq <= highest
+        assert self.nclkouts < self.nclkouts_max
+        self.clkouts[self.nclkouts] = (cd.clk, freq, phase, margin)
+        create_clkout_log(self.logger, cd.name, freq, margin, self.nclkouts)
+        self.clk_names.append(cd.name)
+        self.nclkouts += 1
+
+    def compute_config(self):  # Returns the first divider set that satisfies every registered output; raises ValueError if none does
+        config = {}  # reused across attempts; entries from a rejected attempt are overwritten before any return
+        for clki_div in range(*self.clki_div_range):
+            config["clki_div"] = clki_div
+            for clkfb_div in range(*self.clkfb_div_range):
+                all_valid = True
+                vco_freq = self.clkin_freq/clki_div*clkfb_div
+                (vco_freq_min, vco_freq_max) = self.vco_out_freq_range
+                if vco_freq >= vco_freq_min and vco_freq <= vco_freq_max:
+                    for n, (clk, f, p, m) in sorted(self.clkouts.items()):
+                        valid = False
+                        for d in range(*self.clko_div_range):  # smallest output divider that lands inside the margin wins
+                            clk_freq = vco_freq/d
+                            if abs(clk_freq - f) <= f*m:  # m is a relative tolerance on the requested frequency f
+                                config["clko{}_freq".format(n)]  = clk_freq
+                                config["clko{}_div".format(n)]   = d
+                                config["clko{}_phase".format(n)] = p
+                                valid = True
+                                break
+                        if not valid:
+                            all_valid = False
+                else:
+                    all_valid = False
+                if all_valid:
+                    config["vco"] = vco_freq
+                    config["clkfb_div"] = clkfb_div
+                    compute_config_log(self.logger, config)
+                    return config
+        raise ValueError("No PLL config found")
+
+    def calculate_analog_parameters(self, clki_freq, fb_div, bw_factor = 5):
+        """Return the analog loop-filter settings as Instance parameters.
+
+        Calls calc_optimal_params with the input divider fixed at 1 and
+        prefixes every selected key with "p_".
+        """
+        optimal = self.calc_optimal_params(clki_freq, fb_div, 1, bw_factor)
+        names = (
+            "CSET", "CRIPPLE", "V2I_PP_RES", "IPP_SEL",
+            "IPP_CTRL", "BW_CTL_BIAS", "IPI_CMP",
+        )
+
+        return {"p_" + name: optimal[name] for name in names}
+
+    def do_finalize(self):
+        """Resolve the divider configuration, collect every parameter/port and instantiate the "PLL" primitive."""
+        config = self.compute_config()
+        clkfb  = Signal()
+
+        self.params.update(
+            p_V2I_PP_ICTRL      = "0b11111", # Hard coded in all reference files
+            p_IPI_CMPN          = "0b0011", # Hard coded in all reference files
+
+            p_V2I_1V_EN         = "ENABLED", # Enabled = 1V (Default in references, but not the primitive), Disabled = 0.9V
+            p_V2I_KVCO_SEL      = "60", # if (VOLTAGE == 0.9V) 85 else 60
+            p_KP_VCO            = "0b00011", # if (VOLTAGE == 0.9V) 0b11001 else 0b00011
+
+            p_PLLPD_N           = "USED",
+            p_PLLRESET_ENA      = "ENABLED",
+            p_REF_INTEGER_MODE  = "ENABLED", # Ref manual has a discrepancy so let's always set this value just in case
+            p_REF_MMD_DIG       = "1", # Divider for the input clock, ie 'M'. NOTE(review): compute_config may select clki_div > 1 - confirm
+
+            i_PLLRESET          = self.reset,
+            i_REFCK             = self.clkin,
+            o_LOCK              = self.locked,
+
+            # Use CLKOS5 & divider for feedback
+            p_SEL_FBK           = "FBKCLK5",
+            p_ENCLK_CLKOS5      = "ENABLED",
+            p_DIVF              = str(config["clkfb_div"]-1), # str(Actual value - 1)
+            p_DELF              = str(config["clkfb_div"]-1),
+            p_CLKMUX_FB         = "CMUX_CLKOS5",
+            i_FBKCK             = clkfb,
+            o_CLKOS5            = clkfb,
+
+            # Set feedback divider to 1
+            p_FBK_INTEGER_MODE  = "ENABLED",
+            p_FBK_MASK          = "0b00000000",
+            p_FBK_MMD_DIG       = "1",
+        )
+
+        analog_params = self.calculate_analog_parameters(self.clkin_freq, config["clkfb_div"])
+        self.params.update(analog_params)
+        n_to_l = {0: "P", 1: "S", 2: "S2", 3:"S3", 4:"S4"}
+
+        enables = []
+        for n, (clk, f, p, m) in sorted(self.clkouts.items()):
+            div    = config["clko{}_div".format(n)]
+            delay  = int((1+p/360) * div)  # requested phase p (degrees) expressed in divider counts
+            letter = chr(n+65)  # output index 0.. -> "A".. suffix of the DIV/PHI/DEL parameters
+            self.params["p_ENCLK_CLKO{}".format(n_to_l[n])] = "ENABLED"
+            self.params["p_DIV{}".format(letter)] = str(div-1)
+            self.params["p_PHI{}".format(letter)] = "0"
+            self.params["p_DEL{}".format(letter)] = str(delay - 1)
+            self.params["o_CLKO{}".format(n_to_l[n])] = clk
+            enables.append((self.clk_names[n], 1))
+
+            # In theory this really shouldn't be necessary, in practice
+            # the tooling seems to have suspicious clock latency values
+            # on generated clocks that are causing timing problems and Lattice
+            # hasn't responded to my support requests on the matter.
+            if self.platform and self.create_output_port_clocks:
+                self.platform.add_platform_command("create_clock -period {} -name {} [get_pins {}.PLL_inst/CLKO{}]".format(str(1/f*1e9), self.name + "_" + n_to_l[n],self.name, n_to_l[n]))
+
+        self.enable = Record(enables)  # one 1-bit field per output clock, named after its clock domain
+        for n, _ in sorted(self.clkouts.items()):
+            self.params["i_ENCLKO{}".format(n_to_l[n])] = getattr(self.enable, self.clk_names[n])
+
+        # Every parameter and port is collected at this point; emit the primitive.
+        self.specials += Instance("PLL", name = self.name, **self.params)
+
+    # The gist of calculating the analog parameters is to run through all the
+    # permutations of the parameters and find the optimum set of values based
+    # on the transfer function of the PLL loop filter. There are constraints on
+    # a few specific parameters, the open loop transfer function, and the closed loop
+    # transfer function. An optimal solution is chosen based on the bandwidth
+    # of the response relative to the input reference frequency of the PLL.
+
+    # In later revisions of the Lattice calculator BW_FACTOR is set to 10; it may need to change here too.
+    def calc_optimal_params(self, fref, fbkdiv, M = 1, BW_FACTOR = 5):
+        """Pick the cached loop-filter permutation with the highest closed-loop -3 dB frequency.
+
+        Returns the HDL parameter dict; raises ValueError when every permutation is rejected.
+        """
+        print("Calculating Analog Paramters for a reference freqeuncy of " + str(fref*1e-6) +
+              " Mhz, feedback div " + str(fbkdiv) + ", and input div " + str(M) + "."
+        )
+
+        best_params, best_3db = None, 0
+        for params in self.transfer_func_coefficients:
+            # Closed-loop peaking must lie within [0.8, 1.35] dB.
+            peak = self.closed_loop_peak(fbkdiv, params)["peak"]
+            if peak < 0.8 or peak > 1.35:
+                continue
+            # The phase reported at the open-loop 0 dB crossing must exceed 45 degrees.
+            if self.open_loop_crossing(fbkdiv, params)["phase"] <= 45:
+                continue
+            # NOTE(review): fref is treated as Hz in the message above but scaled by 1e6 here - confirm units.
+            f_3db = self.closed_loop_3db(fbkdiv, params)["f"]
+            if fref*1e6 / M / f_3db < BW_FACTOR:
+                continue
+            if best_3db < f_3db:
+                best_params, best_3db = params, f_3db
+
+        # Fail clearly here: a None result used to surface as an AttributeError inside the converter.
+        if best_params is None:
+            raise ValueError("No valid PLL analog parameters found")
+        print("Done calculating analog parameters:")
+        HDL_params = self.numerical_params_to_HDL_params(best_params)
+        pprint.pprint(HDL_params)
+        return HDL_params
+
+
+    def numerical_params_to_HDL_params(self, params):
+        """Format one numeric parameter permutation as the string values passed to the PLL primitive."""
+        IPP_SEL_LUT = {1: 1, 2: 3, 3: 7, 4: 15}
+        # round(), not int(): a float quotient landing just below an integer must not be truncated one step low.
+        return {
+            "CRIPPLE": str(round(params.CRIPPLE / 1e-12)) + "P",
+            "CSET": str(round((params.CSET / 4e-12)*4)) + "P",
+            "V2I_PP_RES": "{0:g}".format(params.V2I_PP_RES/1e3).replace(".","P") + "K",
+            "IPP_CTRL": "0b{0:04b}".format(round(params.IPP_CTRL / 1e-6) + 3),
+            "IPI_CMP": "0b{0:04b}".format(round(params.IPI_CMP / .5e-6)),
+            "BW_CTL_BIAS": "0b{0:04b}".format(params.BW_CTL_BIAS),
+            "IPP_SEL": "0b{0:04b}".format(IPP_SEL_LUT[params.IPP_SEL]),
+        }
+
+    def calc_valid_io_i2(self):
+        # Valid permutations of the IPP_CTRL, BW_CTL_BIAS, IPP_SEL, and IPI_CMP parameters are constrained
+        # by the following equations, so we can narrow the problem space by calculating
+        # them early in the process.
+        # ip = 5.0/3 * ipp_ctrl*bw_ctl_bias*ipp_sel
+        # ip/ipi_cmp == 50 +- 1e-4
+
+        self.valid_io_i2_permutations = []
+
+        # List out the valid values of each parameter
+        IPP_CTRL_VALUES = range(1,4+1)
+        IPP_CTRL_UNITS = 1e-6
+        IPP_CTRL_VALUES = [element * IPP_CTRL_UNITS for element in IPP_CTRL_VALUES]
+        BW_CTL_BIAS_VALUES = range(1,15+1)
+        IPP_SEL_VALUES = range(1,4+1)
+        IPI_CMP_VALUES = range(1,15+1)
+        IPI_CMP_UNITS = 0.5e-6
+        IPI_CMP_VALUES = [element * IPI_CMP_UNITS for element in IPI_CMP_VALUES]
+
+        for IPP_CTRL in IPP_CTRL_VALUES:
+            for BW_CTL_BIAS in BW_CTL_BIAS_VALUES:
+                for IPP_SEL in IPP_SEL_VALUES:
+                    for IPI_CMP in IPI_CMP_VALUES:
+                        is_valid_io_i2 = self.is_valid_io_i2(IPP_CTRL, BW_CTL_BIAS, IPP_SEL, IPI_CMP)  # dict on success, False otherwise
+                        if is_valid_io_i2 and self.is_unique_io(is_valid_io_i2['io']):  # keep only the first permutation seen for each io value
+                            self.valid_io_i2_permutations.append( io_i2(
+                                is_valid_io_i2['io'], is_valid_io_i2['i2'],
+                                IPP_CTRL, BW_CTL_BIAS, IPP_SEL
+                            ) )
+
+    def is_unique_io(self, io):  # True when no cached permutation already carries this exact io value
+        return all(entry.io != io for entry in self.valid_io_i2_permutations)
+
+    def is_valid_io_i2(self, IPP_CTRL, BW_CTL_BIAS, IPP_SEL, IPI_CMP):
+        """Return {'io': ip, 'i2': IPI_CMP} when ip/IPI_CMP is within 1e-4 of 50, otherwise False."""
+        max_error = 1e-4
+        ip = 5.0/3.0 * IPP_CTRL * BW_CTL_BIAS * IPP_SEL
+        # Guard clause: reject any ratio outside the tolerance window.
+        if not abs(ip/IPI_CMP-50) < max_error:
+            return False
+        return {'io':ip,'i2':IPI_CMP}
+
+    def calc_tf_coefficients(self):
+        # Take the permutations of the various analog parameters
+        # then precalculate the coefficients of the transfer function.
+        # During the final calculations sub in the feedback divisor
+        # to get the final transfer functions.
+
+        #       (ABF+EC)s^2 + (A(F(G+1)+B) + ED)s + A(G+1)          C1s^2 + C2s + C3
+        # tf = -------------------------------------------- =  --------------------------
+        #               ns^2(CFs^2 + (DF+C)s + D)                ns^2(C4s^2 + C5s + C6)
+
+        # A = i2*g3*ki
+        # B = r1*c3
+        # C = B*c2      (c2 is CSET in the code below)
+        # D = c2+c3
+        # E = io*ki*k1
+        # F = r*cs      (r is V2I_PP_RES and cs is CRIPPLE in the code below)
+        # G = k3
+        # n = total divisor of the feedback signal (output + N)
+
+        # Constants
+        c3 = 20e-12
+        g3 = 0.2952e-3
+        k1 = 6
+        k3 = 100
+        ki = 508e9
+        r1 = 9.8e6
+        B = r1*c3  # recomputed with the same value inside the loop below
+
+        # PLL Parameters
+        CSET_VALUES = range(2,17+1)
+        CSET_UNITS = 4e-12
+        CSET_VALUES = [element * CSET_UNITS for element in CSET_VALUES]
+        CRIPPLE_VALUES = [1, 3, 5, 7, 9, 11, 13, 15]
+        CRIPPLE_UNITS = 1e-12
+        CRIPPLE_VALUES = [element * CRIPPLE_UNITS for element in CRIPPLE_VALUES]
+        V2I_PP_RES_VALUES = [9000, 9300, 9700, 10000, 10300, 10700, 11000, 11300]
+
+        self.transfer_func_coefficients = []
+
+        # Run through all the permutations and cache it all
+        for io_i2 in self.valid_io_i2_permutations:  # NOTE: this loop variable shadows the module-level io_i2 namedtuple inside this method
+            for CSET in CSET_VALUES:
+                for CRIPPLE in CRIPPLE_VALUES:
+                    for V2I_PP_RES in V2I_PP_RES_VALUES:
+                        A = io_i2.i2*g3*ki
+                        B = r1*c3
+                        C = B*CSET
+                        D = CSET+c3
+                        E = io_i2.io*ki*k1
+                        F = V2I_PP_RES*CRIPPLE
+                        G = k3
+
+                        self.transfer_func_coefficients.append( nx_pll_param_permutation(
+                            A*B*F+E*C, # C1
+                            A*(F*(G+1)+B)+E*D, # C2
+                            A*(G+1), # C3
+                            C*F, # C4
+                            D*F+C, # C5
+                            D, # C6
+                            io_i2.IPP_CTRL, io_i2.BW_CTL_BIAS, io_i2.IPP_SEL,
+                            CSET, CRIPPLE, V2I_PP_RES, io_i2.i2  # the last field (IPI_CMP) is filled from io_i2.i2
+                        ))
+
+    def calc_tf(self, n, s, params):  # Evaluates (C1*s^2 + C2*s + C3) / (n*s^2*(C4*s^2 + C5*s + C6)) at s for divisor n
+        return ( (params.C1 * s ** 2 + params.C2 * s + params.C3) /
+                ( n * s ** 2 * (params.C4 * s ** 2 + params.C5 * s + params.C6) ) )
+
+    def closed_loop_peak(self, fbkdiv, params):  # Returns the largest closed-loop magnitude seen (dB) and the frequency where it was seen
+        f = 1e6
+        step = 1.1
+        step_divs = 0
+        # Running maximum of the closed-loop magnitude in dB, and its frequency.
+        peak_value = -99
+        peak_f = 0
+        # Magnitude at the previously accepted step; a drop below it triggers a refinement.
+        last_value = -99
+        # Sweep upward from 1 MHz, multiplying f by `step`, while f stays below 1 GHz.
+        while f < 1e9:
+            s = 1j * 2 * pi * f
+            tf_value = self.calc_tf(fbkdiv, s, params)
+            this_result = 20*log10(abs(tf_value/(1+tf_value)))  # closed loop = tf/(1+tf), in dB
+            if this_result > peak_value:
+                peak_value = this_result
+                peak_f = f
+            # On a drop: back f up by step**2 and halve (step - 1); give up after 5 refinements.
+            if this_result < last_value and step_divs < 5:
+                f = f/(step**2)
+                step = (step - 1) * .5 + 1
+                step_divs = step_divs + 1
+            elif this_result < last_value and step_divs == 5:
+                break
+            else:
+                last_value = this_result
+                f = f * step
+
+        return {"peak":peak_value, "peak_freq":peak_f}
+
+    def closed_loop_3db(self, fbkdiv, params):  # Returns the last swept frequency whose closed-loop magnitude was not below -3 dB
+        f = 1e6
+        step = 1.1
+        step_divs = 0
+        # Last frequency that was accepted (magnitude not below -3 dB); 1 if none was.
+        last_f = 1
+        # Sweep upward from 1 MHz, multiplying f by `step`, while f stays below 1 GHz.
+        while f < 1e9:
+            s = 1j * 2 * pi * f
+            tf_value = self.calc_tf(fbkdiv, s, params)
+            this_result = 20*log10(abs(tf_value/(1+tf_value)))  # closed loop = tf/(1+tf), in dB
+            # Below -3 dB: return to the last accepted f and halve (step - 1); give up after 5 refinements.
+            if (this_result+3) < 0 and step_divs < 5:
+                f = last_f
+                step = (step - 1) * .5 + 1
+                step_divs = step_divs + 1
+            elif (this_result+3) < 0 and step_divs == 5:
+                break
+            else:
+                last_f = f
+                f = f * step
+
+        return {"f":last_f}
+
+    def open_loop_crossing(self, fbkdiv, params):  # Returns the last swept frequency with open-loop magnitude not below 0 dB, and a phase in degrees
+        f = 1e6
+        step = 1.1
+        step_divs = 0
+        # Last accepted frequency (magnitude not below 0 dB) and the transfer-function value there.
+        last_f = 1
+        last_tf = 0
+        # Sweep upward from 1 MHz, multiplying f by `step`, while f stays below 1 GHz.
+        while f < 1e9:
+            s = 1j * 2 * pi * f
+            tf_value = self.calc_tf(fbkdiv, s, params)
+            this_result = 20*log10(abs(tf_value))  # open-loop magnitude in dB
+            # Below 0 dB: return to the last accepted f and halve (step - 1); give up after 5 refinements.
+            if this_result < 0 and step_divs < 5:
+                f = last_f
+                step = (step - 1) * .5 + 1
+                step_divs = step_divs + 1
+            elif this_result < 0 and step_divs == 5:
+                break
+            else:
+                last_f = f
+                last_tf = tf_value
+                f = f * step
+        # The phase is taken of -last_tf and converted to degrees; presumably this is the phase margin - TODO confirm.
+        return {"f":last_f, "phase":phase(-last_tf)*180/pi}
diff --git a/soc/hps_proto2_platform.py b/soc/hps_proto2_platform.py
index 9bf2ae446..d8542b66c 100644
--- a/soc/hps_proto2_platform.py
+++ b/soc/hps_proto2_platform.py
@@ -9,7 +9,7 @@
 from litex.build.lattice.programmer import LatticeProgrammer
 from litex.soc.cores.clock import NXOSCA
 # from litex.soc.cores.ram import NXLRAM
-from hps_lattice_nx import NXLRAM
+from hps_lattice_nx import NXLRAM, NXPLL
 
 hps_io = [
     ("done", 0, Pins("A5"), IOStandard("LVCMOS18H")),
@@ -64,25 +64,47 @@ class _CRG(Module):
     """Clock Reset Generator"""
 
     def __init__(self, platform, sys_clk_freq):
-        self.clock_domains.cd_sys = ClockDomain()
+        # Input for PLL: the oscillator clocks this domain and the PLL takes cd_por.clk as its reference
         self.clock_domains.cd_por = ClockDomain()
 
+        # Outputs from PLL
+        self.clock_domains.cd_sys = ClockDomain()
+        self.clock_domains.cd_cfu = ClockDomain()
+
+        # PLL output clocks' enable signals (reset value 1; wired to the PLL enable record in do_finalize)
+        self.sys_clk_enable = Signal(reset=1)
+        self.cfu_clk_enable = Signal(reset=1)
+
         # Clock from HFOSC
-        self.submodules.sys_clk = sys_osc = NXOSCA()
-        sys_osc.create_hf_clk(self.cd_sys, sys_clk_freq)
+        self.submodules.osc_clk = sys_osc = NXOSCA()
+        sys_osc.create_hf_clk(self.cd_por, sys_clk_freq)
+
         # We make the period constraint 7% tighter than our actual system
         # clock frequency, because the CrossLink-NX internal oscillator runs
         # at ±7% of nominal frequency.
-        platform.add_period_constraint(self.cd_sys.clk,
-                                       1e9 / (sys_clk_freq * 1.07))
+        clk_freq = sys_clk_freq * 1.07  # frequency handed to the PLL for its input and for both outputs
 
         # Power On Reset
         por_cycles = 4096
         por_counter = Signal(log2_int(por_cycles), reset=por_cycles - 1)
-        self.comb += self.cd_por.clk.eq(self.cd_sys.clk)
         self.sync.por += If(por_counter != 0, por_counter.eq(por_counter - 1))
-        self.specials += AsyncResetSynchronizer(
-            self.cd_sys, (por_counter != 0))
+
+        # PLL: both outputs are requested at the same frequency as the input
+        self.submodules.sys_pll = sys_pll = NXPLL(platform=platform, create_output_port_clocks=True)
+        sys_pll.register_clkin(self.cd_por.clk, clk_freq)
+        sys_pll.create_clkout(self.cd_sys, clk_freq)
+        sys_pll.create_clkout(self.cd_cfu, clk_freq)
+
+        self.specials += [  # both domains stay in reset until the PLL reports lock and the POR counter reaches 0
+            AsyncResetSynchronizer(self.cd_sys, ~self.sys_pll.locked | (por_counter != 0)),
+            AsyncResetSynchronizer(self.cd_cfu, ~self.sys_pll.locked | (por_counter != 0)),
+        ]
+
+    def do_finalize(self):  # Deferred: sys_pll.enable only becomes a Record in NXPLL.do_finalize (assumes the submodule is finalized first - TODO confirm)
+        self.comb += [
+            self.sys_pll.enable.sys.eq(self.sys_clk_enable),
+            self.sys_pll.enable.cfu.eq(self.cfu_clk_enable),
+        ]
 
 
 _nextpnr_report_filename = 'nextpnr-nexus-report.json'
diff --git a/soc/hps_soc.py b/soc/hps_soc.py
index 1c7104d32..8dd454b0f 100755
--- a/soc/hps_soc.py
+++ b/soc/hps_soc.py
@@ -35,7 +35,8 @@
 from litespi.phy.generic import LiteSPIPHY
 from litespi import LiteSPI
 
-from migen import Module, Instance, Signal, Record
+from migen import Module, Signal, Record, ClockSignal, ResetSignal, \
+                  ClockDomainsRenamer, FSM, NextValue, NextState, If
 
 from patch import Patch
 # from cam_control import CameraControl
@@ -54,6 +55,73 @@
 SOC_DIR = os.path.dirname(os.path.realpath(__file__))
 
 
+class CfuCpuClockCtrl(Module):
+    """
+    FSM that drives the CFU and CPU clock-enable signals from the cmd/rsp valid
+    handshake on the CFU bus, so that power usage is optimized.
+    """
+
+    def __init__(self, cfu_bus, cfu_cen, cpu_cen):
+        """Constructor
+
+        Args:
+            cfu_bus: bus between CFU and CPU; only cmd.valid and rsp.valid are read
+            cfu_cen: clock enable signal for CFU, driven by this module
+            cpu_cen: clock enable signal for CPU, driven by this module
+        """
+        self.cfu_bus = cfu_bus
+        self.cfu_cen = cfu_cen
+        self.cpu_cen = cpu_cen
+
+        delay = Signal(max=10)  # counts the cycles spent in the RESET state
+
+        self.submodules.fsm = fsm = FSM()
+        # It's just an initial state, once left it's not entered again
+        fsm.act("RESET",
+            # Enable both CFU and CPU
+            self.cfu_cen.eq(1),
+            self.cpu_cen.eq(1),
+
+            NextValue(delay, delay + 1),
+
+            # After 10 cycles disable CFU and go to next state
+            # Leave earlier if CPU has a command prepared (the delay check takes priority)
+            If(delay == 9,
+                NextState("CPU_ENABLED"),
+            ).Elif(self.cfu_bus.cmd.valid,
+                NextState("CFU_ENABLED")
+            )
+        )
+
+        fsm.act("CPU_ENABLED",
+            self.cpu_cen.eq(1),
+            self.cfu_cen.eq(0),
+
+            # If CPU has prepared a command, enable CFU
+            If(self.cfu_bus.cmd.valid,
+                self.cfu_cen.eq(1),
+                NextState("CFU_ENABLED"),
+            )
+        )
+
+        fsm.act("CFU_ENABLED",
+            self.cfu_cen.eq(1),
+            self.cpu_cen.eq(1),
+
+            # Disable CPU if CFU received a command (cmd.valid deasserted)
+            # And CFU didn't already respond to that command
+            If(~self.cfu_bus.rsp.valid & ~self.cfu_bus.cmd.valid,
+                self.cpu_cen.eq(0),
+            ),
+
+            # If CFU has prepared a response, enable CPU
+            If(self.cfu_bus.rsp.valid,
+                self.cpu_cen.eq(1),
+                NextState("CPU_ENABLED"),
+            )
+        )
+
+
 class HpsSoC(LiteXSoC):
     # Memory layout
     csr_origin = 0xf0000000
@@ -98,6 +166,19 @@ def __init__(self, platform, debug, variant=None,
                      reset_address=reset_address,
                      cfu=cpu_cfu)
 
+        cfu_cen = Signal()
+        cpu_cen = Signal()
+        self.comb += [
+            self.crg.sys_clk_enable.eq(cpu_cen),
+            self.crg.cfu_clk_enable.eq(cfu_cen)
+        ]
+
+        self.cpu.cfu_params.update(i_clk=ClockSignal("cfu"))
+        self.cpu.cfu_params.update(i_reset=ResetSignal("cfu"))
+
+        self.submodules.cfu_cpu_clk_ctl = ClockDomainsRenamer("por")(
+                CfuCpuClockCtrl(self.cpu.cfu_bus, cfu_cen, cpu_cen))
+
         # RAM
         if separate_arena:
             ram_size = 64*KB
@@ -149,7 +230,7 @@ def __init__(self, platform, debug, variant=None,
 
     def setup_ram(self, size):
         region = SoCRegion(self.sram_origin, size, cached=True, linker=True)
-        self.submodules.lram = self.platform.create_ram(32, size)
+        self.submodules.lram = ClockDomainsRenamer("por")(self.platform.create_ram(32, size))
         self.bus.add_slave("sram_lram", self.lram.bus, region)
         self.bus.add_region("sram", region)
 
@@ -158,7 +239,7 @@ def setup_arena(self, size):
         region = SoCRegion(self.arena_origin, size, cached=True, linker=True)
         self.bus.add_region("arena", region)
         if size > 0:
-            self.submodules.arena = self.platform.create_ram(32, size, dual_port=True)
+            self.submodules.arena = ClockDomainsRenamer("por")(self.platform.create_ram(32, size, dual_port=True))
             self.bus.add_slave("arena_lram", self.arena.bus, region)
             self.add_config('SOC_SEPARATE_ARENA')