WIP: cross validation experiments for SPM and GPU performance

stillwater-sc · Dec 16, 2024 · 1a21549 · 1a21549
1 parent 39ea407
commit 1a21549
Show file tree

Hide file tree

Showing 13 changed files with 276 additions and 47 deletions.
diff --git a/energysim/execution/gpu_metrics.py b/energysim/execution/gpu_metrics.py
@@ -1,5 +1,8 @@
 from tabulate import tabulate
 
+from energysim.utils.scientific_format import scientific_format
+
+
 class GraphicsProcessingUnitMetrics:
     def __init__(self, name: str):
         self.name = name
@@ -39,10 +42,28 @@ def __init__(self, name: str):
         self.events = dict.fromkeys(self.keys, 0)
         self.energy = dict.fromkeys(self.keys, 0.0)
 
+        # machine attributes
+        self.core_clock_ghz: float = 0
+        self.memory_clock_ghz: float = 0
+        self.word_size: int = 0
+        self.cache_line_size: int = 0
+        self.memory_burst: int = 0
+        self.memory_channels: int = 0
+        self.channel_width: int = 0
+        # kernel attributes
+        self.threads_per_block: int = 0
+        self.blocks_per_grid: int = 0
         # performance metrics
-        self.TIPS = 0   # instructions per second
-        self.TOPS = 0   # floating point operations per second
-        self.MemGOPS = 0   # memory operations per second
+        self.elapsed_time: float = 0    # in seconds
+        self.instr_per_sec: float = 0   # instructions per second
+        self.flops_per_sec: float = 0   # floating point operations per second
+        self.memory_ops: int = 0
+        self.memory_clock_ns: float = 0  # memory clock cycle in nano-seconds
+        self.memops_per_sec: float = 0   # memory operations per second
+        self.read_data: float = 0 # memory read in bytes
+        self.write_data: float = 0 # memory written in bytes
+        self.memory_read_bw: float = 0  # memory read bandwidth in bytes/sec
+        self.memory_write_bw: float = 0  # memory write bandwidth in bytes/sec
 
     def __repr__(self):
         return f"GraphicsProcessingUnitMetrics(name='{self.name}', ...)"
@@ -139,4 +160,32 @@ def report(self):
 
         print(tabulate(data, headers="firstrow", floatfmt=".1f"))
 
+        print()
+        print(f'Machine Configuration')
+        print(f'Core clock      : {self.core_clock_ghz} GHz')
+        print(f'Memory clock    : {self.memory_clock_ghz} GHz')
+        print(f'Word size       : {self.word_size} bytes')
+        print(f'Cache line size : {self.cache_line_size} bytes')
+        print(f'Memory burst    : {self.memory_burst} bytes')
+        print(f'Memory channels : {self.memory_channels}')
+        print(f'Channel width   : {self.channel_width} bytes')
+
+        print()
+        print(f'Kernel Dispatch Configuration')
+        print(f'Threads per block : {self.threads_per_block}')
+        print(f'Blocks per grid   : {self.blocks_per_grid}')
+
+        print()
+        print()
+        print(f'Performance summary')
+        print(f'Elapsed time      : ' + scientific_format(self.elapsed_time, 'sec'))
+        print(f'IPS               : ' + scientific_format(self.instr_per_sec, 'IPS'))
+        print(f'FLOPS             : ' + scientific_format(self.flops_per_sec, 'FLOPS'))
+        print(f'Memory ops        : ' + scientific_format(self.memory_ops, 'memory ops'))
+        print(f'Memory clk        : ' + scientific_format(self.memory_clock_ns*1.0e-9, 'sec'))
+        print(f'Data Size read    : ' + scientific_format(self.read_data, 'Bytes'))
+        print(f'Data Size written : ' + scientific_format(self.write_data, 'Bytes'))
+        print(f'Memory ops        : ' + scientific_format(self.memops_per_sec, 'MemoryOps/sec'))
+        print(f'Memory Read       : ' + scientific_format(self.memory_read_bw, 'Bytes/sec'))
+        print(f'Memory Write      : ' + scientific_format(self.memory_write_bw, 'Bytes/sec'))
 
diff --git a/energysim/execution/spm_metrics.py b/energysim/execution/spm_metrics.py
@@ -1,5 +1,8 @@
 from tabulate import tabulate
 
+from energysim.utils.scientific_format import scientific_format
+
+
 class StoredProgramMachineMetrics:
     def __init__(self, name: str):
         self.name = name
@@ -38,10 +41,22 @@ def __init__(self, name: str):
         self.events = dict.fromkeys(self.keys, 0)
         self.energy = dict.fromkeys(self.keys, 0.0)
 
+        # machine attributes
+        self.core_clock_ghz: float = 0
+        self.memory_clock_ghz: float = 0
+        self.cache_line_size: int = 0
+        self.memory_burst: int = 0
         # performance metrics
-        self.TIPS = 0   # instructions per second
-        self.TOPS = 0   # floating point operations per second
-        self.MemGOPS = 0   # memory operations per second
+        self.elapsed_time: float = 0
+        self.instr_per_sec: float = 0   # instructions per second
+        self.flops_per_sec: float = 0   # floating point operations per second
+        self.memory_ops: int = 0
+        self.memory_clock_ns: float = 0  # memory clock cycle in nano-seconds
+        self.memops_per_sec: float = 0   # memory operations per second
+        self.read_data: float = 0 # memory read in bytes
+        self.write_data: float = 0 # memory written in bytes
+        self.memory_read_bw: float = 0  # memory read bandwidth in bytes/sec
+        self.memory_write_bw: float = 0  # memory write bandwidth in bytes/sec
 
     def __repr__(self):
         return f"StoredProgramMachineMetrics(name='{self.name}', ...)"
@@ -144,4 +159,23 @@ def report(self):
 
         print(tabulate(data, headers="firstrow", floatfmt=".1f"))
 
+        print()
+        print(f'Machine Configuration')
+        print(f'Cache line size : {self.cache_line_size} bytes')
+        print(f'Memory burst    : {self.memory_burst} bytes')
+        print(f'Core clock      : {self.core_clock_ghz} GHz')
+        print(f'Memory clock    : {self.memory_clock_ghz} GHz')
+
+        print()
+        print(f'Performance summary')
+        print(f'Elapsed time      : ' + scientific_format(self.elapsed_time, 'sec'))
+        print(f'IPS               : ' + scientific_format(self.instr_per_sec, 'IPS'))
+        print(f'FLOPS             : ' + scientific_format(self.flops_per_sec, 'FLOPS'))
+        print(f'Memory ops        : ' + scientific_format(self.memory_ops, 'memory ops'))
+        print(f'Memory clk        : ' + scientific_format(self.memory_clock_ns*1.0e-9, 'sec'))
+        print(f'Data Size read    : ' + scientific_format(self.read_data, 'Bytes'))
+        print(f'Data Size written : ' + scientific_format(self.write_data, 'Bytes'))
+        print(f'Memory ops        : ' + scientific_format(self.memops_per_sec, 'MemoryOps/sec'))
+        print(f'Memory Read       : ' + scientific_format(self.memory_read_bw, 'Bytes/sec'))
+        print(f'Memory Write      : ' + scientific_format(self.memory_write_bw, 'Bytes/sec'))
 
diff --git a/energysim/models/gpu_configuration.py b/energysim/models/gpu_configuration.py
@@ -1,11 +1,24 @@
 from energysim.models.design_category import DesignCategory
 
 class GraphicsProcessingUnitConfiguration:
-    def __init__(self, category: 'DesignCategory', core_clock_ghz: float, memory_clock_ghz: float, word_size_in_bits: int, memory_burst_size_in_bytes: int, threads_per_block: int, blocks_per_grid: int):
+    def __init__(self,
+                 category: 'DesignCategory',
+                 core_clock_ghz: float,
+                 memory_clock_ghz: float,
+                 word_size_in_bytes: int,
+                 cache_line_size_in_bytes: int,
+                 memory_burst_size_in_bytes: int,
+                 memory_channels: int,
+                 channel_width_in_bytes: int,
+                 threads_per_block: int,
+                 blocks_per_grid: int):
         # GPU attributes
         # structure
-        self.word_size: int = word_size_in_bits
+        self.word_size: int = word_size_in_bytes
+        self.cache_line_size: int = cache_line_size_in_bytes
         self.memory_burst_size: int = memory_burst_size_in_bytes
+        self.memory_channels: int = memory_channels
+        self.channel_width: int = channel_width_in_bytes
         # attributes
         self.category: DesignCategory = category
         self.core_clock: float = core_clock_ghz # GHz
@@ -24,7 +37,11 @@ def __str__(self):
         return f"""
 
         GPU Configuration:
+        - Cache line size:    {self.cache_line_size} bytes
         - Memory burst size:  {self.memory_burst_size} bytes
+        - Word size:          {self.word_size} bytes
+        - Memory channels:    {self.memory_channels}
+        - Channel width:       {self.channel_width} bytes
         
         - Design Category:    {self.category}
         - Core clock:         {self.core_clock} GHz

diff --git a/energysim/models/spm_configuration.py b/energysim/models/spm_configuration.py
@@ -1,11 +1,12 @@
 from energysim.models.design_category import DesignCategory
 
 class StoredProgramMachineConfiguration:
-    def __init__(self, category: 'DesignCategory', processor_clock_ghz: float, memory_clock_ghz: float, cache_line_size_in_bytes: int):
+    def __init__(self, category: 'DesignCategory', processor_clock_ghz: float, memory_clock_ghz: float, cache_line_size_in_bytes: int, memory_burst_size_in_bytes: int, word_size_in_bytes: int):
         # SPM attributes
         # structure
         self.cache_line_size: int = cache_line_size_in_bytes
         self.memory_burst_size: int = cache_line_size_in_bytes
+        self.word_size: int = word_size_in_bytes
         # attributes
         self.category: DesignCategory = category
         self.processor_clock: float = processor_clock_ghz # GHz
@@ -23,6 +24,7 @@ def __str__(self):
         SPM Configuration:
         - Cache line size:    {self.cache_line_size} bytes
         - Memory burst size:  {self.memory_burst_size} bytes
+        - Word size:          {self.word_size} bytes
         
         - Design Category:    {self.category}
         - Processor clock:    {self.processor_clock} GHz

diff --git a/energysim/operator/flat_matmul.py b/energysim/operator/flat_matmul.py
@@ -81,8 +81,8 @@ def flat_matmul_spm(M, N, K, attributes: 'StoredProgramMachineEnergy', config: '
     bandwidth = memory_clock * 8 * 2
     throughput = total_cache_lines_in * config.cache_line_size / bandwidth
     gops = 1.0 / throughput
-    spm_metrics.TIPS = gops / 100.0
-    spm_metrics.TOPS = gops / 1000.0
-    spm_metrics.MemGOPS = gops
+    spm_metrics.instr_per_sec = gops / 100.0
+    spm_metrics.flops_per_sec = gops / 1000.0
+    spm_metrics.memops_per_sec = gops
 
     return spm_metrics
diff --git a/energysim/operator/flat_matvec.py b/energysim/operator/flat_matvec.py
@@ -19,7 +19,7 @@ def flat_matvec_spm(rows, cols, attributes: 'StoredProgramMachineEnergy', config
 
     # instructions flow through the fetch/decode/dispatch part of the pipeline
     # nr of instructions per multiply-add is roughly 6
-    nr_of_instructions: int = fmas * 13
+    nr_of_instructions: int = fmas * 6
     spm_metrics.record('instruction', nr_of_instructions, attributes.instruction)
 
     # we need to read two inputs for each fma,
@@ -32,12 +32,16 @@ def flat_matvec_spm(rows, cols, attributes: 'StoredProgramMachineEnergy', config
     spm_metrics.record('register_write', register_write, attributes.register_write)
 
     # flat mv assumes we are streaming to the cache without reuse
-    cache_line_size = 32  # bytes
+    cache_line_size = config.cache_line_size  # bytes
+    memory_burst_size = config.memory_burst_size # bytes
     matrix_elements = rows * cols
     vector_elements = cols
     total_elements = matrix_elements + vector_elements
-    matrix_cache_lines: int = math.ceil(matrix_elements / cache_line_size)
-    vector_cache_lines: int = math.ceil(vector_elements / cache_line_size)
+    matrix_data_structure_size = matrix_elements * config.word_size
+    vector_data_structure_size = vector_elements * config.word_size
+    total_data_structure_size = total_elements * config.word_size
+    matrix_cache_lines: int = math.ceil(matrix_data_structure_size / cache_line_size)
+    vector_cache_lines: int = math.ceil(vector_data_structure_size / cache_line_size)
     total_cache_lines_in: int = matrix_cache_lines + vector_cache_lines
     total_cache_lines_out: int = vector_cache_lines
     total_cache_lines: int = (total_cache_lines_in + total_cache_lines_out)
@@ -75,17 +79,30 @@ def flat_matvec_spm(rows, cols, attributes: 'StoredProgramMachineEnergy', config
 
     # how long would it take to move the total number of data from and to the memory
     # a 64bit DDR DIMM needs 4 clocks to move a cacheline
-    total_latency = total_cache_lines * 4 * config.memory_cycle_ns
+    memory_ops = total_cache_lines
+    total_elapsed_time_in_sec = memory_ops * 4 * config.memory_cycle_ns * 1.0e-9
 
     # instruction throughput yielded
-    gips = spm_metrics.events['instruction'] / total_latency
-    gops = spm_metrics.events['execute'] / total_latency
-    memory_gops = total_cache_lines / total_latency
-
-    spm_metrics.TIPS = gips / 1000.0
-    spm_metrics.TOPS = gops / 1000.0
-    spm_metrics.MemGOPS = memory_gops
-
+    instr_per_sec = spm_metrics.events['instruction'] / total_elapsed_time_in_sec
+    flops_per_sec = spm_metrics.events['execute'] / total_elapsed_time_in_sec
+    memory_ops_per_second = total_cache_lines / total_elapsed_time_in_sec
+
+    spm_metrics.elapsed_time = total_elapsed_time_in_sec
+    spm_metrics.instr_per_sec = instr_per_sec
+    spm_metrics.flops_per_sec = flops_per_sec
+    spm_metrics.memory_ops = memory_ops
+    spm_metrics.memory_clock_ns = config.memory_cycle_ns
+    spm_metrics.read_data = total_cache_lines_in * cache_line_size
+    spm_metrics.write_data = total_cache_lines_out * cache_line_size
+    spm_metrics.memory_read_bw = spm_metrics.read_data / total_elapsed_time_in_sec
+    spm_metrics.memory_write_bw = spm_metrics.write_data / total_elapsed_time_in_sec
+    spm_metrics.memops_per_sec = memory_ops_per_second
+
+    # copy the machine attributes into the metrics data structure
+    spm_metrics.core_clock_ghz = config.processor_clock
+    spm_metrics.memory_clock_ghz = config.memory_clock
+    spm_metrics.cache_line_size = cache_line_size
+    spm_metrics.memory_burst = memory_burst_size
     return spm_metrics
 
 
@@ -120,6 +137,9 @@ def flat_matvec_gpu(rows, cols, attributes: 'GraphicsProcessingUnitEnergy', conf
     matrix_elements = rows * cols
     vector_elements = cols
     total_elements = matrix_elements + vector_elements
+    matrix_data_structure_size = matrix_elements * config.word_size
+    vector_data_structure_size = vector_elements * config.word_size
+    total_data_structure_size = total_elements * config.word_size
 
     # a decoded instruction is sent to all the ALUs via a Warp (NVIDIA) or Wavefront (AMD) scheduler
     # this is an energetic event and needs to be tracked
@@ -169,16 +189,32 @@ def flat_matvec_gpu(rows, cols, attributes: 'GraphicsProcessingUnitEnergy', conf
 
     # how long would it take to move the total number of data from and to the memory
     # a 64bit DDR DIMM needs 4 clocks to move a cacheline
-    total_memory_bursts = total_memory_read_bursts + total_memory_read_bursts
-    total_latency = total_memory_bursts * 4 * config.memory_cycle_ns
+    total_memory_ops = total_memory_read_bursts + total_memory_read_bursts
+    total_elapsed_time_in_sec = total_memory_ops * 4 * config.memory_cycle_ns * 1.0e-9
 
     # instruction throughput yielded
-    gips = gpu_metrics.events['instruction'] / total_latency
-    gops = gpu_metrics.events['execute'] / total_latency
-    memory_gops = total_memory_bursts / total_latency
-
-    gpu_metrics.TIPS = gips / 1000.0
-    gpu_metrics.TOPS = gops / 1000.0
-    gpu_metrics.MemGOPS = memory_gops
+    instr_per_sec = gpu_metrics.events['instruction'] / total_elapsed_time_in_sec
+    flops_per_sec = gpu_metrics.events['execute'] / total_elapsed_time_in_sec
+    memory_ops_per_second = total_memory_ops / total_elapsed_time_in_sec
+
+    gpu_metrics.elapsed_time = total_elapsed_time_in_sec
+    gpu_metrics.instr_per_sec = instr_per_sec
+    gpu_metrics.flops_per_sec = flops_per_sec
+    gpu_metrics.memory_ops = total_memory_ops
+    gpu_metrics.memory_clock_ns = config.memory_cycle_ns
+    gpu_metrics.read_data = matrix_data_structure_size
+    gpu_metrics.write_data = vector_data_structure_size
+    gpu_metrics.memory_read_bw = matrix_data_structure_size / total_elapsed_time_in_sec
+    gpu_metrics.memory_write_bw = vector_data_structure_size / total_elapsed_time_in_sec
+    gpu_metrics.memops_per_sec = memory_ops_per_second
+
+    # copy the machine attributes into the metrics data structure
+    gpu_metrics.core_clock_ghz = config.core_clock
+    gpu_metrics.memory_clock_ghz = config.memory_clock
+    gpu_metrics.word_size = config.word_size
+    gpu_metrics.cache_line_size = config.cache_line_size
+    gpu_metrics.memory_burst = config.memory_burst_size
+    gpu_metrics.memory_channels = config.memory_channels
+    gpu_metrics.channel_width = config.channel_width
 
     return gpu_metrics
diff --git a/energysim/utils/scientific_format.py b/energysim/utils/scientific_format.py
@@ -0,0 +1,46 @@
+import math
+
+
+def scientific_format(value: float, dimension: str) -> str:
+    """
+
+    Args:
+        value: floating-point value to convert
+
+    Returns: str
+
+    """
+    if value == 0.0:
+        return f"{value:7.3f} {dimension}"
+
+    value = math.fabs(value)
+    if value >= 1.0:
+        if 1.0e0 <= value < 1000.0:
+            return f"{value:7.3f} {dimension}"
+        elif 1.0e3 <= value < 1.0e6:
+            return f"{value/1.0e3:7.3f} k{dimension}"
+        elif 1.0e6 <= value < 1.0e9:
+            return f"{value/1.0e6:7.3f} M{dimension}"
+        elif 1.0e9 <= value < 1.0e12:
+            return f"{value/1.0e9:7.3f} G{dimension}"
+        elif 1.0e12 <= value < 1.0e15:
+            return f"{value/1.0e12:7.3f} T{dimension}"
+        elif 1.0e15 <= value < 1.0e18:
+            return f"{value/1.0e15:7.3f} P{dimension}"
+        elif 1.0e18 <= value < 1.0e21:
+            return f"{value/1.0e18:7.3f} E{dimension}"
+        else:
+            return f"{value / 1.0e24:7.3f} Z{dimension}"
+    else:
+        if 1.0e-3 <= value < 1.0e0:
+            return f"{value/1.0e-3:7.3f} m{dimension}"
+        elif 1.0e-6 <= value < 1.0e-3:
+            return f"{value/1.0e-6:7.3f} u{dimension}"
+        elif 1.0e-9 <= value < 1.0e-6:
+            return f"{value/1.0e-9:7.3f} n{dimension}"
+        elif 1.0e-12 <= value < 1.0e-9:
+            return f"{value/1.0e-12:7.3f} p{dimension}"
+        elif 1.0e-15 <= value < 1.0e-12:
+            return f"{value/1.0e-15:7.3f} f{dimension}"
+        else:
+            return f"{value/1.0e-18:7.3f} a{dimension}"
diff --git a/scripts/playground/gpu_matvec.py b/scripts/playground/gpu_matvec.py
@@ -38,14 +38,21 @@ def sample_gpu(process_node: str, config: 'GraphicsProcessingUnitConfiguration')
     memory_clock_ghz = 4.0
     word_size_in_bits = 32
     memory_burst_in_bytes = 64
-
+    cache_line_size_in_bytes = 64
+    word_size_in_bytes = 4   # 4 bytes for single precision, 2 bytes for half, and 1 byte for FP8
+    memory_burst_size_in_bytes = 64 # typically 32, 64, or 128 bytes
+    memory_channels = 4
+    channel_width_in_bytes = 8 # LPDDR tends to be 2 bytes, DDR and GDDR tend to be 8 bytes, HBM is 128 bytes
 
     config = GraphicsProcessingUnitConfiguration(
         category,
         core_clock_ghz,
         memory_clock_ghz,
-        word_size_in_bits,
-        memory_burst_in_bytes,
+        word_size_in_bytes,
+        cache_line_size_in_bytes,
+        memory_burst_size_in_bytes,
+        memory_channels,
+        channel_width_in_bytes,
         threads_per_block,
         blocks_per_grid
     )