restructuring the data frame to build a scatter plot
Ravenwater committed Dec 8, 2024
1 parent 4d1f988 commit a564f88
Showing 8 changed files with 230 additions and 147 deletions.
27 changes: 27 additions & 0 deletions data/old.txt
@@ -0,0 +1,27 @@


self.add_8b = 0.03
self.add_16b = 0.05
self.add_32b = 0.1

self.mul_8b = 0.2
self.mul_16b = 1.0 # estimated
self.mul_32b = 3.1

self.add_fp8 = 0.2 # estimated
self.add_fp16 = 0.4
self.add_fp32 = 0.9

self.mul_fp8 = 0.4 # estimated
self.mul_fp16 = 1.1
self.mul_fp32 = 3.7

# assuming 0.2pJ per bit
self.register_read_8b = 1.6
self.register_read_16b = 3.2
self.register_read_32b = 6.4

# assuming 0.3pJ per bit
self.register_write_8b = 2.4
self.register_write_16b = 4.8
self.register_write_32b = 9.6
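
The register-file numbers above follow directly from the stated per-bit assumptions. A minimal sketch of that derivation (names are ours, not part of the repo):

# hypothetical check: derive the register energies above from per-bit costs
READ_PJ_PER_BIT = 0.2   # assumption stated above
WRITE_PJ_PER_BIT = 0.3  # assumption stated above

def register_energy_pj(bits: int, pj_per_bit: float) -> float:
    # energy to move `bits` bits through a register port
    return bits * pj_per_bit

for width in (8, 16, 32):
    print(f"read_{width}b = {register_energy_pj(width, READ_PJ_PER_BIT):.1f} pJ")    # 1.6, 3.2, 6.4
    print(f"write_{width}b = {register_energy_pj(width, WRITE_PJ_PER_BIT):.1f} pJ")  # 2.4, 4.8, 9.6
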
90 changes: 90 additions & 0 deletions data/process_n14.txt
@@ -0,0 +1,90 @@
For a typical 14nm CMOS CPU:
Instruction Fetch:
5-10 pJ per instruction
Depends on cache hierarchy, fetch width

Instruction Decode:
2-5 pJ per instruction
Complexity varies with instruction set architecture

Instruction Dispatch:
3-7 pJ per instruction
Influenced by out-of-order execution complexity

Total energy for these stages: ~10-20 pJ per instruction, with significant variation based on specific microarchitectural design.

For a typical 14nm CMOS register file:

Read energy: 0.2-0.4 pJ per bit
Influenced by register size, typically 32-128 bits per register
Specific designs can reduce energy to ~0.1 pJ per bit with advanced techniques

Write energy: 0.3-0.6 pJ per bit
Slightly higher than read energy due to additional charging/state-setting requirements
Dependent on write port design, bitline capacitance, and voltage

For a typical 14nm CMOS L1 cache:
Read energy: 1-3 pJ per bit
Write energy: 2-4 pJ per bit
Energy varies with:
Cache size (typically 32-64 KB)
Associativity
Access transistor design
Bitline and wordline capacitances

32-byte cacheline read = 32*8*[1, 1.5, 2, 3 pJ] = 256, 384, 512, 768 pJ
32-byte cacheline write = 32*8*[2, 3, 4 pJ] = 512, 768, 1024 pJ
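
The cacheline arithmetic above generalizes to any transfer size and per-bit cost. A small illustrative sketch (function name is ours, not the repo's):

def cacheline_energy_pj(line_bytes: int, pj_per_bit: float) -> float:
    # bits moved times energy per bit
    return line_bytes * 8 * pj_per_bit

# 32-byte L1 line at the per-bit costs quoted above
print([cacheline_energy_pj(32, e) for e in (1.0, 1.5, 2.0, 3.0)])  # reads: 256, 384, 512, 768 pJ
print([cacheline_energy_pj(32, e) for e in (2.0, 3.0, 4.0)])       # writes: 512, 768, 1024 pJ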

For a DDR5 cacheline read in a 14nm CMOS system:

Typical cacheline size: 64 bytes (512 bits)
Energy per DDR5 read: Approximately 10-20 picojoules (pJ) per bit
Total cacheline read energy: ~5-10 nanojoules (nJ)

For a DDR5 cacheline write in a 14nm CMOS system:

Typical cacheline size: 64 bytes (512 bits)
Energy per DDR5 write: Approximately 15-25 picojoules (pJ) per bit
Total cacheline write energy: ~8-13 nanojoules (nJ)

The write energy is slightly higher than read energy due to:
Additional signal transitions
Power required to change memory state
Driving write circuitry
Increased signal conditioning requirements

Factors affecting write energy include:
Memory controller design
Write amplifier circuits
Signal integrity techniques
Operating frequency
Physical transmission distance

32-byte DDR5 read: 32*8*[10, 15, 20 pJ] = 2560, 3840, 5120 pJ
32-byte DDR5 write: 32*8*[15, 20, 25 pJ] = 3840, 5120, 6400 pJ
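
The same cacheline_energy_pj sketch from the L1 section above reproduces these DDR5 figures:

print([cacheline_energy_pj(32, e) for e in (10.0, 15.0, 20.0)])  # reads: 2560, 3840, 5120 pJ
print([cacheline_energy_pj(32, e) for e in (15.0, 20.0, 25.0)])  # writes: 3840, 5120, 6400 pJ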

CPUs (14nm TSMC):

Typical clock frequencies: 2.5 - 3.5 GHz
Cycle times: Approximately 0.28 to 0.4 nanoseconds (ns)
Specific ranges depend on performance vs. power optimization
High-performance cores tend to be closer to 0.28 ns
Power-efficient cores tend to be closer to 0.4 ns

GPUs (14nm TSMC):

Typical clock frequencies: 1.0 - 1.8 GHz
Cycle times: Approximately 0.55 to 1.0 nanoseconds (ns)
Lower clock speeds compared to CPUs due to more complex parallel processing architectures
Emphasis on throughput rather than single-thread performance

DSPs (14nm TSMC):

Typical clock frequencies: 1.5 - 2.5 GHz
Cycle times: Approximately 0.4 to 0.66 nanoseconds (ns)
Optimized for specific signal processing tasks
Cycle times can vary based on specific DSP architecture and design goals

These figures represent typical ranges and can vary based on specific implementation,
target application, and design optimization strategies. The actual cycle time is influenced
by factors like standard cell library, logic depth, clock tree design, and specific performance requirements.
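
Cycle time is simply the reciprocal of clock frequency, which is how the ranges above pair up. A one-line check:

def cycle_time_ns(freq_ghz: float) -> float:
    # 1 / GHz yields nanoseconds
    return 1.0 / freq_ghz

print(cycle_time_ns(3.5), cycle_time_ns(2.5))  # ~0.286 ns and 0.4 ns: the CPU range
print(cycle_time_ns(1.8), cycle_time_ns(1.0))  # ~0.556 ns and 1.0 ns: the GPU range
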
55 changes: 55 additions & 0 deletions data/process_n7.txt
@@ -0,0 +1,55 @@
CPU Register Read for 7nm


Energy: Approximately 0.1 to 0.3 picojoules (pJ)
Very low energy consumption due to minimal capacitive switching
Typically involves small, tightly-coupled storage elements


L1 Cache Read:


Energy: Around 1.0 to 2.5 picojoules (pJ)
Higher energy compared to register read due to:

Larger array size
More complex sense amplifiers
Longer bit lines
Additional peripheral circuitry




Instruction Fetch, Decode, and Dispatch:


Total Energy: Approximately 5 to 10 picojoules (pJ)
Breakdown:

Fetch: 2-4 pJ
Decode: 1-3 pJ
Dispatch: 1-3 pJ


Energy varies based on instruction complexity
More complex instructions (e.g., vector or SIMD) consume more energy



In a 7nm process node for a typical CPU, an L2 cache read consumes approximately 6 to 12 picojoules (pJ).
The specific factors influencing L2 read energy include:

Cache array size (typically 256KB to 1MB per core)
Dynamic and static power characteristics of 7nm FinFET transistors
Bit line length and capacitance
Sense amplifier design
Peripheral circuitry complexity

The higher energy compared to L1 cache reads reflects:

Larger physical array size
Longer interconnect distances
More complex peripheral circuits
Additional tag and data matching logic

The lower end of the range (6-8 pJ) represents more power-optimized designs, while the higher end (10-12 pJ) is typical for high-performance implementations prioritizing speed over energy efficiency.
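
These 7nm ranges are what the new n7l/n7t/n7h columns in data/spm_energy.csv encode. A hedged sketch of that mapping (low/high copied from the prose above; the typical values are midpoints, matching the CSV):

n7_event_energy_pj = {
    # event: (low, typical, high)
    "fetch":    (2.0, 3.0, 4.0),
    "decode":   (1.0, 2.0, 3.0),
    "dispatch": (1.0, 2.0, 3.0),
    "reg_read": (0.1, 0.2, 0.3),
    "l1_read":  (1.0, 1.75, 2.5),
}
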
31 changes: 15 additions & 16 deletions data/spm_energy.csv
@@ -1,16 +1,15 @@
node, n14, n14, n14
sample low, typical, high
cycle_time, 0.40, 0.34, 0.28
fetch, 5.00, 7.50, 10.00
decode, 2.00, 3.50, 5.00
dispatch, 3.00, 5.00, 7.00
reg_read, 0.20, 0.30, 0.40
reg_write, 0.30, 0.45, 0.60
l1_read, 1.00, 2.00, 3.00
l1_write, 2.00, 3.00, 4.00
l2_read, 2.00, 4.00, 6.00
l2_write, 4.00, 6.00, 8.00
l3_read, 4.00, 6.00, 8.00
l3_write, 8.00, 12.00, 16.00
mem_read, 10.00, 15.00, 20.00
mem_write, 15.00, 20.00, 25.00
node, n14l, n14t, n14h, n7l, n7t, n7h
cycle_time, 0.40, 0.34, 0.28, 0.35, 0.30, 0.25
fetch, 5.00, 7.50, 10.00, 2.00, 3.00, 4.00
decode, 2.00, 3.50, 5.00, 1.00, 2.00, 3.00
dispatch, 3.00, 5.00, 7.00, 1.00, 2.00, 3.00
reg_read, 0.20, 0.30, 0.40, 0.10, 0.20, 0.30
reg_write, 0.30, 0.45, 0.60, 0.15, 0.30, 0.45
l1_read, 1.00, 2.00, 3.00, 1.00, 1.75, 2.50
l1_write, 2.00, 3.00, 4.00, 2.00, 3.00, 4.00
l2_read, 2.00, 4.00, 6.00, 2.00, 4.00, 6.00
l2_write, 4.00, 6.00, 8.00, 4.00, 6.00, 8.00
l3_read, 4.00, 6.00, 8.00, 4.00, 6.00, 8.00
l3_write, 8.00, 12.00, 16.00, 8.00, 12.00, 16.00
mem_read, 10.00, 15.00, 20.00, 10.00, 15.00, 20.00
mem_write, 15.00, 20.00, 25.00, 15.00, 20.00, 25.00
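
In this restructured wide format each node variant is a single column, which is what the scatter plot in the commit message needs. A minimal plotting sketch (assuming pandas and matplotlib; the axis pairing is our choice):

import matplotlib.pyplot as plt
import pandas as pd

# skipinitialspace strips the blank after each comma in the CSV
df = pd.read_csv("data/spm_energy.csv", skipinitialspace=True)
events = df[df["node"] != "cycle_time"]  # keep only the energy rows

plt.scatter(events["n14t"], events["n7t"])
for _, row in events.iterrows():
    plt.annotate(row["node"], (row["n14t"], row["n7t"]))
plt.xlabel("14nm typical energy (pJ)")
plt.ylabel("7nm typical energy (pJ)")
plt.show()
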
135 changes: 11 additions & 124 deletions energysim/database/spm_energy.py
@@ -36,7 +36,6 @@ def __init__(self, data_source: str):
"""
self.data_source = data_source
self.data = None
self.node = None
self.cycle_time_ns = None

def load_data(self) -> pd.DataFrame:
@@ -45,136 +44,24 @@ def load_data(self) -> pd.DataFrame:
:return: Loaded DataFrame
"""
self.data = pd.read_csv(self.data_source)
return self.data
self.data = pd.read_csv(self.data_source, skipinitialspace=True)
return pd.DataFrame(self.data)

# generate will create a set of energy values that the operator models will use to calculate
# energy consumption of the operator
def generate(self, node: str) -> StoredProgramMachineEventEnergy:
self.node = node

# query the database
# df = self.data.query('node == @node')
df = pd.DataFrame(self.data)
print(df.columns)
n14t = (df[["n14t"]])
print(n14t)
return StoredProgramMachineEventEnergy(node)




# self.add_8b = 0.03
# self.add_16b = 0.05
# self.add_32b = 0.1
#
# self.mul_8b = 0.2
# self.mul_16b = 1.0 # estimated
# self.mul_32b = 3.1
#
# self.add_fp8 = 0.2 # estimated
# self.add_fp16 = 0.4
# self.add_fp32 = 0.9
#
# self.mul_fp8 = 0.4 # estimated
# self.mul_fp16 = 1.1
# self.mul_fp32 = 3.7
#
# # assuming 0.2pJ per bit
# self.register_read_8b = 1.6
# self.register_read_16b = 3.2
# self.register_read_32b = 6.4
#
# # assuming 0.3pJ per bit
# self.register_write_8b = 2.4
# self.register_write_16b = 4.8
# self.register_write_32b = 9.6




# For a typical 14nm CMOS CPU:
# Instruction Fetch:
# 5-10 pJ per instruction
# Depends on cache hierarchy, fetch width
#
# Instruction Decode:
# 2-5 pJ per instruction
# Complexity varies with instruction set architecture
#
# Instruction Dispatch:
# 3-7 pJ per instruction
# Influenced by out-of-order execution complexity
#
# Total energy for these stages: ~10-20 pJ per instruction, with significant variation based on specific microarchitectural design.

# For a typical 14nm CMOS register file:
#
# Read energy: 0.2-0.4 pJ per bit
# Influenced by register size, typically 32-128 bits per register
# Specific design can reduce energy to ~0.1 pJ per bit with advanced techniques
#
# Write energy: 0.3-0.6 pJ per bit
# Slightly higher than read energy due to additional charging/state-setting requirements
# Dependent on write port design, bitline capacitance, and voltage

# For a typical 14nm CMOS L1 cache
# Read energy: 1-3 pJ per bit
# Write energy: 2-4 pJ per bit.
# Energy varies with:
# Cache size (typically 32-64 KB)
# Associativity
# Access transistor design
# Bitline and wordline capacitances
#
# 32byte cacheline read = 32*8*[1,1.5,2,3pJ] = 256, 384, 512, 768pJ
# 32byte cacheline write = 32*8*[2,3,4pJ] = 512, 768, 1024pJ

# For a DDR5 cacheline read in a 14nm CMOS system:
#
# Typical cacheline size: 64 bytes (512 bits)
# Energy per DDR5 read: Approximately 10-20 picojoules (pJ) per bit
# Total cacheline read energy: ~5-10 nanojoules (nJ)
#
# For a DDR5 cacheline write in a 14nm CMOS system:
#
# Typical cacheline size: 64 bytes (512 bits)
# Energy per DDR5 write: Approximately 15-25 picojoules (pJ) per bit
# Total cacheline write energy: ~8-13 nanojoules (nJ)
#
# The write energy is slightly higher than read energy due to:
# Additional signal transitions
# Power required to change memory state
# Driving write circuitry
# Increased signal conditioning requirements
#
# Factors affecting write energy include:
# Memory controller design
# Write amplifier circuits
# Signal integrity techniques
# Operating frequency
# Physical transmission distance

# 32byte DDR5 read: 32*8*[10, 15, 20pJ] = 2560, 3840, 5120pJ
# 32byte DDR5 write: 32*8*[15, 20, 25pJ] = 3840, 5120, 6400pJ

# CPUs (14nm TSMC):
#
# Typical clock frequencies: 2.5 - 3.5 GHz
# Cycle times: Approximately 0.28 to 0.4 nanoseconds (ns)
# Specific ranges depend on performance vs. power optimization
# High-performance cores tend to be closer to 0.28 ns
# Power-efficient cores tend to be closer to 0.4 ns
#
# GPUs (14nm TSMC):
#
# Typical clock frequencies: 1.0 - 1.8 GHz
# Cycle times: Approximately 0.55 to 1.0 nanoseconds (ns)
# Lower clock speeds compared to CPUs due to more complex parallel processing architectures
# Emphasis on throughput rather than single-thread performance
#
# DSPs (14nm TSMC):
#
# Typical clock frequencies: 1.5 - 2.5 GHz
# Cycle times: Approximately 0.4 to 0.66 nanoseconds (ns)
# Optimized for specific signal processing tasks
# Cycle times can vary based on specific DSP architecture and design goals
#
# These figures represent typical ranges and can vary based on specific implementation,
# target application, and design optimization strategies. The actual cycle time is influenced
# by factors like standard cell library, logic depth, clock tree design, and specific performance requirements.
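
For orientation, a hypothetical driver for the database class in this file (the class name sits outside the visible hunks, so EnergyDatabase below is a stand-in):

db = EnergyDatabase("data/spm_energy.csv")  # stand-in name; actual class not shown in the hunk
df = db.load_data()                         # parses the CSV with skipinitialspace=True
energies = db.generate("n14t")              # returns a StoredProgramMachineEventEnergy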




6 changes: 6 additions & 0 deletions energysim/execution/spm.py
@@ -28,6 +28,10 @@ def __init__(self, name: str):
self.memory = 0
self.data_movement = 0
self.total = 0
# performance metrics
self.TIPS = 0 # tera-instructions per second
self.TOPS = 0 # tera-operations per second (floating point)
self.MemGOPS = 0 # memory giga-operations per second

def report(self):
total_compute = self.instruction + self.execute + self.register_read + self.register_write
@@ -50,6 +54,8 @@ def report(self):
self.data_movement = total_data_movement
self.total = total_energy



if total_data_movement > 0:
l1_read_relative = self.l1_read / total_data_movement
l2_read_relative = self.l2_read / total_data_movement