Skip to content

Commit

Permalink
WIP: cross validation experiments for SPM and GPU performance
Browse files Browse the repository at this point in the history
  • Loading branch information
Ravenwater committed Dec 16, 2024
1 parent 39ea407 commit 1a21549
Show file tree
Hide file tree
Showing 13 changed files with 276 additions and 47 deletions.
55 changes: 52 additions & 3 deletions energysim/execution/gpu_metrics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from tabulate import tabulate

from energysim.utils.scientific_format import scientific_format


class GraphicsProcessingUnitMetrics:
def __init__(self, name: str):
self.name = name
Expand Down Expand Up @@ -39,10 +42,28 @@ def __init__(self, name: str):
self.events = dict.fromkeys(self.keys, 0)
self.energy = dict.fromkeys(self.keys, 0.0)

# machine attributes
self.core_clock_ghz: float = 0
self.memory_clock_ghz: float = 0
self.word_size: int = 0
self.cache_line_size: int = 0
self.memory_burst: int = 0
self.memory_channels: int = 0
self.channel_width: int = 0
# kernel attributes
self.threads_per_block: int = 0
self.blocks_per_grid: int = 0
# performance metrics
self.TIPS = 0 # instructions per second
self.TOPS = 0 # floating point operations per second
self.MemGOPS = 0 # memory operations per second
self.elapsed_time: float = 0 # in seconds
self.instr_per_sec: float = 0 # instructions per second
self.flops_per_sec: float = 0 # floating point operations per second
self.memory_ops: int = 0
self.memory_clock_ns: float = 0 # memory clock cycle in nano-seconds
self.memops_per_sec: float = 0 # memory operations per second
self.read_data: float = 0 # memory read in MB
self.write_data: float = 0 # memory written in MB
self.memory_read_bw: float = 0 # memory read bandwidth in GB/s
self.memory_write_bw: float = 0 # memory write bandwidth in GB/s

def __repr__(self):
return f"GraphicsProcessingUnitMetrics(name='{self.name}', ...)"
Expand Down Expand Up @@ -139,4 +160,32 @@ def report(self):

print(tabulate(data, headers="firstrow", floatfmt=".1f"))

print()
print(f'Machine Configuration')
print(f'Core clock : {self.core_clock_ghz} GHz')
print(f'Memory clock : {self.memory_clock_ghz} GHz')
print(f'Word size : {self.word_size} bytes')
print(f'Cache line size : {self.cache_line_size} bytes')
print(f'Memory burst : {self.memory_burst} bytes')
print(f'Memory channels : {self.memory_channels}')
print(f'Channel width : {self.channel_width} bytes')

print()
print(f'Kernel Dispatch Configuration')
print(f'Threads per block : {self.threads_per_block}')
print(f'Blocks per grid : {self.blocks_per_grid}')

print()
print()
print(f'Performance summary')
print(f'Elapsed time : ' + scientific_format(self.elapsed_time, 'sec'))
print(f'IPS : ' + scientific_format(self.instr_per_sec, 'IPS'))
print(f'FLOPS : ' + scientific_format(self.flops_per_sec, 'FLOPS'))
print(f'Memory ops : ' + scientific_format(self.memory_ops, 'memory ops'))
print(f'Memory clk : ' + scientific_format(self.memory_clock_ns*1.0e-9, 'sec'))
print(f'Data Size read : ' + scientific_format(self.read_data, 'Bytes'))
print(f'Data Size written : ' + scientific_format(self.write_data, 'Bytes'))
print(f'Memory ops : ' + scientific_format(self.memops_per_sec, 'MemoryOps/sec'))
print(f'Memory Read : ' + scientific_format(self.memory_read_bw, 'Bytes/sec'))
print(f'Memory Write : ' + scientific_format(self.memory_write_bw, 'Bytes/sec'))

40 changes: 37 additions & 3 deletions energysim/execution/spm_metrics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from tabulate import tabulate

from energysim.utils.scientific_format import scientific_format


class StoredProgramMachineMetrics:
def __init__(self, name: str):
self.name = name
Expand Down Expand Up @@ -38,10 +41,22 @@ def __init__(self, name: str):
self.events = dict.fromkeys(self.keys, 0)
self.energy = dict.fromkeys(self.keys, 0.0)

# machine attributes
self.core_clock_ghz: float = 0
self.memory_clock_ghz: float = 0
self.cache_line_size: int = 0
self.memory_burst: int = 0
# performance metrics
self.TIPS = 0 # instructions per second
self.TOPS = 0 # floating point operations per second
self.MemGOPS = 0 # memory operations per second
self.elapsed_time: float = 0
self.instr_per_sec: float = 0 # instructions per second
self.flops_per_sec: float = 0 # floating point operations per second
self.memory_ops: int = 0
self.memory_clock_ns: float = 0 # memory clock cycle in nano-seconds
self.memops_per_sec: float = 0 # memory operations per second
self.read_data: float = 0 # memory read in MB
self.write_data: float = 0 # memory written in MB
self.memory_read_bw: float = 0 # memory read bandwidth in GB/s
self.memory_write_bw: float = 0 # memory write bandwidth in GB/s

def __repr__(self):
return f"StoredProgramMachineMetrics(name='{self.name}', ...)"
Expand Down Expand Up @@ -144,4 +159,23 @@ def report(self):

print(tabulate(data, headers="firstrow", floatfmt=".1f"))

print()
print(f'Machine Configuration')
print(f'Cache line size : {self.cache_line_size} bytes')
print(f'Memory burst : {self.memory_burst} bytes')
print(f'Core clock : {self.core_clock_ghz} GHz')
print(f'Memory clock : {self.memory_clock_ghz} GHz')

print()
print(f'Performance summary')
print(f'Elapsed time : ' + scientific_format(self.elapsed_time, 'sec'))
print(f'IPS : ' + scientific_format(self.instr_per_sec, 'IPS'))
print(f'FLOPS : ' + scientific_format(self.flops_per_sec, 'FLOPS'))
print(f'Memory ops : ' + scientific_format(self.memory_ops, 'memory ops'))
print(f'Memory clk : ' + scientific_format(self.memory_clock_ns*1.0e-9, 'sec'))
print(f'Data Size read : ' + scientific_format(self.read_data, 'Bytes'))
print(f'Data Size written : ' + scientific_format(self.write_data, 'Bytes'))
print(f'Memory ops : ' + scientific_format(self.memops_per_sec, 'MemoryOps/sec'))
print(f'Memory Read : ' + scientific_format(self.memory_read_bw, 'Bytes/sec'))
print(f'Memory Write : ' + scientific_format(self.memory_write_bw, 'Bytes/sec'))

21 changes: 19 additions & 2 deletions energysim/models/gpu_configuration.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,24 @@
from energysim.models.design_category import DesignCategory

class GraphicsProcessingUnitConfiguration:
def __init__(self, category: 'DesignCategory', core_clock_ghz: float, memory_clock_ghz: float, word_size_in_bits: int, memory_burst_size_in_bytes: int, threads_per_block: int, blocks_per_grid: int):
def __init__(self,
category: 'DesignCategory',
core_clock_ghz: float,
memory_clock_ghz: float,
word_size_in_bytes: int,
cache_line_size_in_bytes: int,
memory_burst_size_in_bytes: int,
memory_channels: int,
channel_width_in_bytes: int,
threads_per_block: int,
blocks_per_grid: int):
# GPU attributes
# structure
self.word_size: int = word_size_in_bits
self.word_size: int = word_size_in_bytes
self.cache_line_size: int = cache_line_size_in_bytes
self.memory_burst_size: int = memory_burst_size_in_bytes
self.memory_channels: int = memory_channels
self.channel_width: int = channel_width_in_bytes
# attributes
self.category: DesignCategory = category
self.core_clock: float = core_clock_ghz # GHz
Expand All @@ -24,7 +37,11 @@ def __str__(self):
return f"""
GPU Configuration:
- Cache line size: {self.cache_line_size} bytes
- Memory burst size: {self.memory_burst_size} bytes
- Word size: {self.word_size} bytes
- Memory channels: {self.memory_channels}
- Channel width: {self.channel_width} bytes
- Design Category: {self.category}
- Core clock: {self.core_clock} GHz
Expand Down
4 changes: 3 additions & 1 deletion energysim/models/spm_configuration.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from energysim.models.design_category import DesignCategory

class StoredProgramMachineConfiguration:
def __init__(self, category: 'DesignCategory', processor_clock_ghz: float, memory_clock_ghz: float, cache_line_size_in_bytes: int):
def __init__(self, category: 'DesignCategory', processor_clock_ghz: float, memory_clock_ghz: float, cache_line_size_in_bytes: int, memory_burst_size_in_bytes: int, word_size_in_bytes: int):
# SPM attributes
# structure
self.cache_line_size: int = cache_line_size_in_bytes
self.memory_burst_size: int = cache_line_size_in_bytes
self.word_size: int = word_size_in_bytes
# attributes
self.category: DesignCategory = category
self.processor_clock: float = processor_clock_ghz # GHz
Expand All @@ -23,6 +24,7 @@ def __str__(self):
SPM Configuration:
- Cache line size: {self.cache_line_size} bytes
- Memory burst size: {self.memory_burst_size} bytes
- Word size: {self.word_size} bytes
- Design Category: {self.category}
- Processor clock: {self.processor_clock} GHz
Expand Down
6 changes: 3 additions & 3 deletions energysim/operator/flat_matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ def flat_matmul_spm(M, N, K, attributes: 'StoredProgramMachineEnergy', config: '
bandwidth = memory_clock * 8 * 2
throughput = total_cache_lines_in * config.cache_line_size / bandwidth
gops = 1.0 / throughput
spm_metrics.TIPS = gops / 100.0
spm_metrics.TOPS = gops / 1000.0
spm_metrics.MemGOPS = gops
spm_metrics.instr_per_sec = gops / 100.0
spm_metrics.flops_per_sec = gops / 1000.0
spm_metrics.memops_per_sec = gops

return spm_metrics
80 changes: 58 additions & 22 deletions energysim/operator/flat_matvec.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def flat_matvec_spm(rows, cols, attributes: 'StoredProgramMachineEnergy', config

# instructions flow through the fetch/decode/dispatch part of the pipeline
# nr of instructions per multiply-add is roughly 6 (previously modeled as 13)
nr_of_instructions: int = fmas * 13
nr_of_instructions: int = fmas * 6
spm_metrics.record('instruction', nr_of_instructions, attributes.instruction)

# we need to read two inputs for each fma,
Expand All @@ -32,12 +32,16 @@ def flat_matvec_spm(rows, cols, attributes: 'StoredProgramMachineEnergy', config
spm_metrics.record('register_write', register_write, attributes.register_write)

# flat mv assumes we are streaming to the cache without reuse
cache_line_size = 32 # bytes
cache_line_size = config.cache_line_size # bytes
memory_burst_size = config.memory_burst_size # bytes
matrix_elements = rows * cols
vector_elements = cols
total_elements = matrix_elements + vector_elements
matrix_cache_lines: int = math.ceil(matrix_elements / cache_line_size)
vector_cache_lines: int = math.ceil(vector_elements / cache_line_size)
matrix_data_structure_size = matrix_elements * config.word_size
vector_data_structure_size = vector_elements * config.word_size
total_data_structure_size = total_elements * config.word_size
matrix_cache_lines: int = math.ceil(matrix_data_structure_size / cache_line_size)
vector_cache_lines: int = math.ceil(vector_data_structure_size / cache_line_size)
total_cache_lines_in: int = matrix_cache_lines + vector_cache_lines
total_cache_lines_out: int = vector_cache_lines
total_cache_lines: int = (total_cache_lines_in + total_cache_lines_out)
Expand Down Expand Up @@ -75,17 +79,30 @@ def flat_matvec_spm(rows, cols, attributes: 'StoredProgramMachineEnergy', config

# how long would it take to move the total number of data from and to the memory
# a 64bit DDR DIMM needs 4 clocks to move a cacheline
total_latency = total_cache_lines * 4 * config.memory_cycle_ns
memory_ops = total_cache_lines
total_elapsed_time_in_sec = memory_ops * 4 * config.memory_cycle_ns * 1.0e-9

# instruction throughput yielded
gips = spm_metrics.events['instruction'] / total_latency
gops = spm_metrics.events['execute'] / total_latency
memory_gops = total_cache_lines / total_latency

spm_metrics.TIPS = gips / 1000.0
spm_metrics.TOPS = gops / 1000.0
spm_metrics.MemGOPS = memory_gops

instr_per_sec = spm_metrics.events['instruction'] / total_elapsed_time_in_sec
flops_per_sec = spm_metrics.events['execute'] / total_elapsed_time_in_sec
memory_ops_per_second = total_cache_lines / total_elapsed_time_in_sec

spm_metrics.elapsed_time = total_elapsed_time_in_sec
spm_metrics.instr_per_sec = instr_per_sec
spm_metrics.flops_per_sec = flops_per_sec
spm_metrics.memory_ops = memory_ops
spm_metrics.memory_clock_ns = config.memory_cycle_ns
spm_metrics.read_data = total_cache_lines_in * cache_line_size
spm_metrics.write_data = total_cache_lines_out * cache_line_size
spm_metrics.memory_read_bw = spm_metrics.read_data / total_elapsed_time_in_sec
spm_metrics.memory_write_bw = spm_metrics.write_data / total_elapsed_time_in_sec
spm_metrics.memops_per_sec = memory_ops_per_second

# copy the machine attributes into the metrics data structure
spm_metrics.core_clock_ghz = config.processor_clock
spm_metrics.memory_clock_ghz = config.memory_clock
spm_metrics.cache_line_size = cache_line_size
spm_metrics.memory_burst = memory_burst_size
return spm_metrics


Expand Down Expand Up @@ -120,6 +137,9 @@ def flat_matvec_gpu(rows, cols, attributes: 'GraphicsProcessingUnitEnergy', conf
matrix_elements = rows * cols
vector_elements = cols
total_elements = matrix_elements + vector_elements
matrix_data_structure_size = matrix_elements * config.word_size
vector_data_structure_size = vector_elements * config.word_size
total_data_structure_size = total_elements * config.word_size

# a decoded instruction is sent to all the ALUs via a Warp (NVIDIA) or Wavefront (AMD) scheduler
# this is an energetic event and needs to be tracked
Expand Down Expand Up @@ -169,16 +189,32 @@ def flat_matvec_gpu(rows, cols, attributes: 'GraphicsProcessingUnitEnergy', conf

# how long would it take to move the total number of data from and to the memory
# a 64bit DDR DIMM needs 4 clocks to move a cacheline
total_memory_bursts = total_memory_read_bursts + total_memory_read_bursts
total_latency = total_memory_bursts * 4 * config.memory_cycle_ns
total_memory_ops = total_memory_read_bursts + total_memory_read_bursts
total_elapsed_time_in_sec = total_memory_ops * 4 * config.memory_cycle_ns * 1.0e-9

# instruction throughput yielded
gips = gpu_metrics.events['instruction'] / total_latency
gops = gpu_metrics.events['execute'] / total_latency
memory_gops = total_memory_bursts / total_latency

gpu_metrics.TIPS = gips / 1000.0
gpu_metrics.TOPS = gops / 1000.0
gpu_metrics.MemGOPS = memory_gops
instr_per_sec = gpu_metrics.events['instruction'] / total_elapsed_time_in_sec
flops_per_sec = gpu_metrics.events['execute'] / total_elapsed_time_in_sec
memory_ops_per_second = total_memory_ops / total_elapsed_time_in_sec

gpu_metrics.elapsed_time = total_elapsed_time_in_sec
gpu_metrics.instr_per_sec = instr_per_sec
gpu_metrics.flops_per_sec = flops_per_sec
gpu_metrics.memory_ops = total_memory_ops
gpu_metrics.memory_clock_ns = config.memory_cycle_ns
gpu_metrics.read_data = matrix_data_structure_size
gpu_metrics.write_data = vector_data_structure_size
gpu_metrics.memory_read_bw = matrix_data_structure_size / total_elapsed_time_in_sec
gpu_metrics.memory_write_bw = vector_data_structure_size / total_elapsed_time_in_sec
gpu_metrics.memops_per_sec = memory_ops_per_second

# copy the machine attributes into the metrics data structure
gpu_metrics.core_clock_ghz = config.core_clock
gpu_metrics.memory_clock_ghz = config.memory_clock
gpu_metrics.word_size = config.word_size
gpu_metrics.cache_line_size = config.cache_line_size
gpu_metrics.memory_burst = config.memory_burst_size
gpu_metrics.memory_channels = config.memory_channels
gpu_metrics.channel_width = config.channel_width

return gpu_metrics
46 changes: 46 additions & 0 deletions energysim/utils/scientific_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import math


def scientific_format(value: float, dimension: str) -> str:
    """Format a value with an SI-style magnitude prefix in front of its unit.

    The value is scaled by the largest power of 1000 (from 1e21 'Z' down to
    1e-18 'a') that does not exceed its magnitude, and rendered with the
    ``7.3f`` format followed by a space, the prefix, and ``dimension``.

    Args:
        value: number to format; the sign is preserved in the output
        dimension: unit label appended after the prefix, e.g. 'sec' or 'Bytes'

    Returns:
        The formatted string, e.g. ``'  1.500 kHz'`` for ``(1500.0, 'Hz')``.
        Zero, NaN and infinities are printed without a prefix. Magnitudes
        below 1e-18 keep the 'a' prefix and magnitudes of 1e21 and above keep
        the 'Z' prefix, since those are the smallest/largest scales supported.
    """
    magnitude = math.fabs(value)
    if magnitude == 0.0 or not math.isfinite(magnitude):
        # nothing meaningful to scale: zero, nan, or +/- inf
        return f"{value:7.3f} {dimension}"

    # ordered from largest to smallest so the first match is the right scale
    scales = (
        (1.0e21, "Z"),
        (1.0e18, "E"),
        (1.0e15, "P"),
        (1.0e12, "T"),
        (1.0e9, "G"),
        (1.0e6, "M"),
        (1.0e3, "k"),
        (1.0e0, ""),
        (1.0e-3, "m"),
        (1.0e-6, "u"),
        (1.0e-9, "n"),
        (1.0e-12, "p"),
        (1.0e-15, "f"),
        (1.0e-18, "a"),
    )
    for scale, prefix in scales:
        if magnitude >= scale:
            # divide the signed value so negative quantities keep their sign
            return f"{value / scale:7.3f} {prefix}{dimension}"

    # smaller than the smallest supported prefix: express in atto-units
    return f"{value / 1.0e-18:7.3f} a{dimension}"
13 changes: 10 additions & 3 deletions scripts/playground/gpu_matvec.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,21 @@ def sample_gpu(process_node: str, config: 'GraphicsProcessingUnitConfiguration')
memory_clock_ghz = 4.0
word_size_in_bits = 32
memory_burst_in_bytes = 64

cache_line_size_in_bytes = 64
word_size_in_bytes = 4 # 4 bytes for single precision, 2 bytes for half, and 1 byte for FP8
memory_burst_size_in_bytes = 64 # typically can be 32b, 64b, 128bytes
memory_channels = 4
channel_width_in_bytes = 8 # LPDDR tends to be 2 bytes, DDR and GDDR tend to be 8 bytes, HBM is 128 bytes

config = GraphicsProcessingUnitConfiguration(
category,
core_clock_ghz,
memory_clock_ghz,
word_size_in_bits,
memory_burst_in_bytes,
word_size_in_bytes,
cache_line_size_in_bytes,
memory_burst_size_in_bytes,
memory_channels,
channel_width_in_bytes,
threads_per_block,
blocks_per_grid
)
Expand Down
Loading

0 comments on commit 1a21549

Please sign in to comment.