diff --git a/energysim/execution/gpu_metrics.py b/energysim/execution/gpu_metrics.py index 58d1a66..ca1b040 100644 --- a/energysim/execution/gpu_metrics.py +++ b/energysim/execution/gpu_metrics.py @@ -1,5 +1,8 @@ from tabulate import tabulate +from energysim.utils.scientific_format import scientific_format + + class GraphicsProcessingUnitMetrics: def __init__(self, name: str): self.name = name @@ -39,10 +42,28 @@ def __init__(self, name: str): self.events = dict.fromkeys(self.keys, 0) self.energy = dict.fromkeys(self.keys, 0.0) + # machine attributes + self.core_clock_ghz: float = 0 + self.memory_clock_ghz: float = 0 + self.word_size: int = 0 + self.cache_line_size: int = 0 + self.memory_burst: int = 0 + self.memory_channels: int = 0 + self.channel_width: int = 0 + # kernel attributes + self.threads_per_block: int = 0 + self.blocks_per_grid: int = 0 # performance metrics - self.TIPS = 0 # instructions per second - self.TOPS = 0 # floating point operations per second - self.MemGOPS = 0 # memory operations per second + self.elapsed_time: float = 0 # in seconds + self.instr_per_sec: float = 0 # instructions per second + self.flops_per_sec: float = 0 # floating point operations per second + self.memory_ops: int = 0 + self.memory_clock_ns: float = 0 # memory clock cycle in nano-seconds + self.memops_per_sec: float = 0 # memory operations per second + self.read_data: float = 0 # memory read in MB + self.write_data: float = 0 # memory written in MB + self.memory_read_bw: float = 0 # memory read bandwidth in GB/s + self.memory_write_bw: float = 0 # memory write bandwidth in GB/s def __repr__(self): return f"GraphicsProcessingUnitMetrics(name='{self.name}', ...)" @@ -139,4 +160,32 @@ def report(self): print(tabulate(data, headers="firstrow", floatfmt=".1f")) + print() + print(f'Machine Configuration') + print(f'Core clock : {self.core_clock_ghz} GHz') + print(f'Memory clock : {self.memory_clock_ghz} GHz') + print(f'Word size : {self.word_size} bytes') + print(f'Cache line 
size : {self.cache_line_size} bytes') + print(f'Memory burst : {self.memory_burst} bytes') + print(f'Memory channels : {self.memory_channels}') + print(f'Channel width : {self.channel_width} bytes') + + print() + print(f'Kernel Dispatch Configuration') + print(f'Threads per block : {self.threads_per_block}') + print(f'Blocks per grid : {self.blocks_per_grid}') + + print() + print() + print(f'Performance summary') + print(f'Elapsed time : ' + scientific_format(self.elapsed_time, 'sec')) + print(f'IPS : ' + scientific_format(self.instr_per_sec, 'IPS')) + print(f'FLOPS : ' + scientific_format(self.flops_per_sec, 'FLOPS')) + print(f'Memory ops : ' + scientific_format(self.memory_ops, 'memory ops')) + print(f'Memory clk : ' + scientific_format(self.memory_clock_ns*1.0e-9, 'sec')) + print(f'Data Size read : ' + scientific_format(self.read_data, 'Bytes')) + print(f'Data Size written : ' + scientific_format(self.write_data, 'Bytes')) + print(f'Memory ops : ' + scientific_format(self.memops_per_sec, 'MemoryOps/sec')) + print(f'Memory Read : ' + scientific_format(self.memory_read_bw, 'Bytes/sec')) + print(f'Memory Write : ' + scientific_format(self.memory_write_bw, 'Bytes/sec')) diff --git a/energysim/execution/spm_metrics.py b/energysim/execution/spm_metrics.py index 0d78eb1..fb665d5 100644 --- a/energysim/execution/spm_metrics.py +++ b/energysim/execution/spm_metrics.py @@ -1,5 +1,8 @@ from tabulate import tabulate +from energysim.utils.scientific_format import scientific_format + + class StoredProgramMachineMetrics: def __init__(self, name: str): self.name = name @@ -38,10 +41,22 @@ def __init__(self, name: str): self.events = dict.fromkeys(self.keys, 0) self.energy = dict.fromkeys(self.keys, 0.0) + # machine attributes + self.core_clock_ghz: float = 0 + self.memory_clock_ghz: float = 0 + self.cache_line_size: int = 0 + self.memory_burst: int = 0 # performance metrics - self.TIPS = 0 # instructions per second - self.TOPS = 0 # floating point operations per second - 
self.MemGOPS = 0 # memory operations per second + self.elapsed_time: float = 0 + self.instr_per_sec: float = 0 # instructions per second + self.flops_per_sec: float = 0 # floating point operations per second + self.memory_ops: int = 0 + self.memory_clock_ns: float = 0 # memory clock cycle in nano-seconds + self.memops_per_sec: float = 0 # memory operations per second + self.read_data: float = 0 # memory read in MB + self.write_data: float = 0 # memory written in MB + self.memory_read_bw: float = 0 # memory read bandwidth in GB/s + self.memory_write_bw: float = 0 # memory write bandwidth in GB/s def __repr__(self): return f"StoredProgramMachineMetrics(name='{self.name}', ...)" @@ -144,4 +159,23 @@ def report(self): print(tabulate(data, headers="firstrow", floatfmt=".1f")) + print() + print(f'Machine Configuration') + print(f'Cache line size : {self.cache_line_size} bytes') + print(f'Memory burst : {self.memory_burst} bytes') + print(f'Core clock : {self.core_clock_ghz} GHz') + print(f'Memory clock : {self.memory_clock_ghz} GHz') + + print() + print(f'Performance summary') + print(f'Elapsed time : ' + scientific_format(self.elapsed_time, 'sec')) + print(f'IPS : ' + scientific_format(self.instr_per_sec, 'IPS')) + print(f'FLOPS : ' + scientific_format(self.flops_per_sec, 'FLOPS')) + print(f'Memory ops : ' + scientific_format(self.memory_ops, 'memory ops')) + print(f'Memory clk : ' + scientific_format(self.memory_clock_ns*1.0e-9, 'sec')) + print(f'Data Size read : ' + scientific_format(self.read_data, 'Bytes')) + print(f'Data Size written : ' + scientific_format(self.write_data, 'Bytes')) + print(f'Memory ops : ' + scientific_format(self.memops_per_sec, 'MemoryOps/sec')) + print(f'Memory Read : ' + scientific_format(self.memory_read_bw, 'Bytes/sec')) + print(f'Memory Write : ' + scientific_format(self.memory_write_bw, 'Bytes/sec')) diff --git a/energysim/models/gpu_configuration.py b/energysim/models/gpu_configuration.py index 6fe566a..aad8e37 100644 --- 
a/energysim/models/gpu_configuration.py +++ b/energysim/models/gpu_configuration.py @@ -1,11 +1,24 @@ from energysim.models.design_category import DesignCategory class GraphicsProcessingUnitConfiguration: - def __init__(self, category: 'DesignCategory', core_clock_ghz: float, memory_clock_ghz: float, word_size_in_bits: int, memory_burst_size_in_bytes: int, threads_per_block: int, blocks_per_grid: int): + def __init__(self, + category: 'DesignCategory', + core_clock_ghz: float, + memory_clock_ghz: float, + word_size_in_bytes: int, + cache_line_size_in_bytes: int, + memory_burst_size_in_bytes: int, + memory_channels: int, + channel_width_in_bytes: int, + threads_per_block: int, + blocks_per_grid: int): # GPU attributes # structure - self.word_size: int = word_size_in_bits + self.word_size: int = word_size_in_bytes + self.cache_line_size: int = cache_line_size_in_bytes self.memory_burst_size: int = memory_burst_size_in_bytes + self.memory_channels: int = memory_channels + self.channel_width: int = channel_width_in_bytes # attributes self.category: DesignCategory = category self.core_clock: float = core_clock_ghz # GHz @@ -24,7 +37,11 @@ def __str__(self): return f""" GPU Configuration: + - Cache line size: {self.cache_line_size} bytes - Memory burst size: {self.memory_burst_size} bytes + - Word size: {self.word_size} bytes + - Memory channels: {self.memory_channels} + - Channel width: {self.channel_width} bytes - Design Category: {self.category} - Core clock: {self.core_clock} GHz diff --git a/energysim/models/spm_configuration.py b/energysim/models/spm_configuration.py index 9803da6..b96efbc 100644 --- a/energysim/models/spm_configuration.py +++ b/energysim/models/spm_configuration.py @@ -1,11 +1,12 @@ from energysim.models.design_category import DesignCategory class StoredProgramMachineConfiguration: - def __init__(self, category: 'DesignCategory', processor_clock_ghz: float, memory_clock_ghz: float, cache_line_size_in_bytes: int): + def __init__(self, category: 
'DesignCategory', processor_clock_ghz: float, memory_clock_ghz: float, cache_line_size_in_bytes: int, memory_burst_size_in_bytes: int, word_size_in_bytes: int): # SPM attributes # structure self.cache_line_size: int = cache_line_size_in_bytes self.memory_burst_size: int = cache_line_size_in_bytes + self.word_size: int = word_size_in_bytes # attributes self.category: DesignCategory = category self.processor_clock: float = processor_clock_ghz # GHz @@ -23,6 +24,7 @@ def __str__(self): SPM Configuration: - Cache line size: {self.cache_line_size} bytes - Memory burst size: {self.memory_burst_size} bytes + - Word size: {self.word_size} bytes - Design Category: {self.category} - Processor clock: {self.processor_clock} GHz diff --git a/energysim/operator/flat_matmul.py b/energysim/operator/flat_matmul.py index 649bf37..3ac0ee2 100644 --- a/energysim/operator/flat_matmul.py +++ b/energysim/operator/flat_matmul.py @@ -81,8 +81,8 @@ def flat_matmul_spm(M, N, K, attributes: 'StoredProgramMachineEnergy', config: ' bandwidth = memory_clock * 8 * 2 throughput = total_cache_lines_in * config.cache_line_size / bandwidth gops = 1.0 / throughput - spm_metrics.TIPS = gops / 100.0 - spm_metrics.TOPS = gops / 1000.0 - spm_metrics.MemGOPS = gops + spm_metrics.instr_per_sec = gops / 100.0 + spm_metrics.flops_per_sec = gops / 1000.0 + spm_metrics.memops_per_sec = gops return spm_metrics diff --git a/energysim/operator/flat_matvec.py b/energysim/operator/flat_matvec.py index 85e510b..27a5a0d 100644 --- a/energysim/operator/flat_matvec.py +++ b/energysim/operator/flat_matvec.py @@ -19,7 +19,7 @@ def flat_matvec_spm(rows, cols, attributes: 'StoredProgramMachineEnergy', config # instructions flow through the fetch/decode/dispatch part of the pipeline # nr of instructions per multiply-add is roughly 13 - nr_of_instructions: int = fmas * 13 + nr_of_instructions: int = fmas * 6 spm_metrics.record('instruction', nr_of_instructions, attributes.instruction) # we need to read two inputs for each 
fma, @@ -32,12 +32,16 @@ def flat_matvec_spm(rows, cols, attributes: 'StoredProgramMachineEnergy', config spm_metrics.record('register_write', register_write, attributes.register_write) # flat mv assumes we are streaming to the cache without reuse - cache_line_size = 32 # bytes + cache_line_size = config.cache_line_size # bytes + memory_burst_size = config.memory_burst_size # bytes matrix_elements = rows * cols vector_elements = cols total_elements = matrix_elements + vector_elements - matrix_cache_lines: int = math.ceil(matrix_elements / cache_line_size) - vector_cache_lines: int = math.ceil(vector_elements / cache_line_size) + matrix_data_structure_size = matrix_elements * config.word_size + vector_data_structure_size = vector_elements * config.word_size + total_data_structure_size = total_elements * config.word_size + matrix_cache_lines: int = math.ceil(matrix_data_structure_size / cache_line_size) + vector_cache_lines: int = math.ceil(vector_data_structure_size / cache_line_size) total_cache_lines_in: int = matrix_cache_lines + vector_cache_lines total_cache_lines_out: int = vector_cache_lines total_cache_lines: int = (total_cache_lines_in + total_cache_lines_out) @@ -75,17 +79,30 @@ def flat_matvec_spm(rows, cols, attributes: 'StoredProgramMachineEnergy', config # how long would it take to move the total number of data from and to the memory # a 64bit DDR DIMM needs 4 clocks to move a cacheline - total_latency = total_cache_lines * 4 * config.memory_cycle_ns + memory_ops = total_cache_lines + total_elapsed_time_in_sec = memory_ops * 4 * config.memory_cycle_ns * 1.0e-9 # instruction throughput yielded - gips = spm_metrics.events['instruction'] / total_latency - gops = spm_metrics.events['execute'] / total_latency - memory_gops = total_cache_lines / total_latency - - spm_metrics.TIPS = gips / 1000.0 - spm_metrics.TOPS = gops / 1000.0 - spm_metrics.MemGOPS = memory_gops - + instr_per_sec = spm_metrics.events['instruction'] / total_elapsed_time_in_sec + 
flops_per_sec = spm_metrics.events['execute'] / total_elapsed_time_in_sec + memory_ops_per_second = total_cache_lines / total_elapsed_time_in_sec + + spm_metrics.elapsed_time = total_elapsed_time_in_sec + spm_metrics.instr_per_sec = instr_per_sec + spm_metrics.flops_per_sec = flops_per_sec + spm_metrics.memory_ops = memory_ops + spm_metrics.memory_clock_ns = config.memory_cycle_ns + spm_metrics.read_data = total_cache_lines_in * cache_line_size + spm_metrics.write_data = total_cache_lines_out * cache_line_size + spm_metrics.memory_read_bw = spm_metrics.read_data / total_elapsed_time_in_sec + spm_metrics.memory_write_bw = spm_metrics.write_data / total_elapsed_time_in_sec + spm_metrics.memops_per_sec = memory_ops_per_second + + # copy the machine attributes into the metrics data structure + spm_metrics.core_clock_ghz = config.processor_clock + spm_metrics.memory_clock_ghz = config.memory_clock + spm_metrics.cache_line_size = cache_line_size + spm_metrics.memory_burst = memory_burst_size return spm_metrics @@ -120,6 +137,9 @@ def flat_matvec_gpu(rows, cols, attributes: 'GraphicsProcessingUnitEnergy', conf matrix_elements = rows * cols vector_elements = cols total_elements = matrix_elements + vector_elements + matrix_data_structure_size = matrix_elements * config.word_size + vector_data_structure_size = vector_elements * config.word_size + total_data_structure_size = total_elements * config.word_size # a decoded instruction is sent to all the ALUs via a Warp (NVIDIA) or Wavefront (AMD) scheduler # this is an energetic event and needs to be tracked @@ -169,16 +189,32 @@ def flat_matvec_gpu(rows, cols, attributes: 'GraphicsProcessingUnitEnergy', conf # how long would it take to move the total number of data from and to the memory # a 64bit DDR DIMM needs 4 clocks to move a cacheline - total_memory_bursts = total_memory_read_bursts + total_memory_read_bursts - total_latency = total_memory_bursts * 4 * config.memory_cycle_ns + total_memory_ops = total_memory_read_bursts + 
import math


def scientific_format(value: float, dimension: str = '') -> str:
    """Format a value with an SI prefix and a unit label.

    The magnitude of ``value`` selects a prefix between atto (1e-18) and
    zetta (1e21).  The scaled number is printed as a fixed-point field of
    width 7 with 3 decimals, followed by a space, the prefix and
    ``dimension``.  Magnitudes in [1, 1000) are printed without a prefix.
    Magnitudes beyond the table are clamped to its nearest end (atto or
    zetta) instead of being rejected.  Zero and non-finite values (inf,
    nan) are printed unscaled and without a prefix.

    Args:
        value: number to format; its sign is preserved in the output.
        dimension: unit label appended directly after the prefix, e.g.
            ``'sec'`` or ``'Bytes'``.  Defaults to the empty string so the
            function can be called with a value only.

    Returns:
        The formatted string, e.g. ``'  1.500 kBytes'``.
    """
    magnitude = math.fabs(value)

    # Zero has no meaningful prefix, and inf/nan cannot be ranked against
    # the table below, so they are printed as-is.
    if magnitude == 0.0 or not math.isfinite(magnitude):
        return f"{value:7.3f} {dimension}"

    # (scale, prefix) pairs in descending order; the first scale that does
    # not exceed the magnitude wins.
    prefixes = (
        (1.0e21, 'Z'),
        (1.0e18, 'E'),
        (1.0e15, 'P'),
        (1.0e12, 'T'),
        (1.0e9, 'G'),
        (1.0e6, 'M'),
        (1.0e3, 'k'),
        (1.0e0, ''),
        (1.0e-3, 'm'),
        (1.0e-6, 'u'),
        (1.0e-9, 'n'),
        (1.0e-12, 'p'),
        (1.0e-15, 'f'),
        (1.0e-18, 'a'),
    )
    for scale, prefix in prefixes:
        if magnitude >= scale:
            break
    # If no entry matched, the loop leaves the last pair (atto) bound, which
    # is the intended floor for magnitudes below 1e-18.

    # Scale the signed value (not the magnitude) so negative inputs keep
    # their sign, and divide by the scale that matches the printed prefix.
    return f"{value / scale:7.3f} {prefix}{dimension}"
word_size_in_bits, - memory_burst_in_bytes, + word_size_in_bytes, + cache_line_size_in_bytes, + memory_burst_size_in_bytes, + memory_channels, + channel_width_in_bytes, threads_per_block, blocks_per_grid ) diff --git a/scripts/playground/scientific_format.py b/scripts/playground/scientific_format.py new file mode 100644 index 0000000..5724dbb --- /dev/null +++ b/scripts/playground/scientific_format.py @@ -0,0 +1,13 @@ +from energysim.utils.scientific_format import scientific_format + +if __name__ == '__main__': + value: float = 1.0 + + for i in range(21): + print(f'Value = {value:.3g} : ' + scientific_format(value)) + value *= 10.0 + + value = 1.0 + for i in range(21): + print(f'Value = {value:.3g} : ' + scientific_format(value)) + value /= 10.0 \ No newline at end of file diff --git a/scripts/playground/spm_matvec.py b/scripts/playground/spm_matvec.py index e1e1625..d26c006 100644 --- a/scripts/playground/spm_matvec.py +++ b/scripts/playground/spm_matvec.py @@ -8,7 +8,9 @@ full = db.load_data('../../data/spm_energy.csv') spm_energies = db.lookupEnergySet('n14l', 64) print(spm_energies) - spm_config = StoredProgramMachineConfiguration(DesignCategory.HighVolume,2.5, 3.2, 64) - spm_metrics = flat_matvec_spm(16, 16, spm_energies, spm_config) + rows = 1024*1024 + cols = 1024 + spm_config = StoredProgramMachineConfiguration(DesignCategory.HighVolume,2.5, 3.2, 64, 64, 4) + spm_metrics = flat_matvec_spm(rows, cols, spm_energies, spm_config) spm_metrics.report() diff --git a/scripts/shmoo/design_space.py b/scripts/shmoo/design_space.py index f4ebc70..78618f8 100644 --- a/scripts/shmoo/design_space.py +++ b/scripts/shmoo/design_space.py @@ -56,7 +56,7 @@ def randomize_spm(nr_samples: int, process_node: str): 'category': [category], 'core_clock_ghz': [processor_clock_ghz], 'memory_clock_ghz': [memory_clock_ghz], - 'performance': [spm_metrics.TIPS], + 'performance': [spm_metrics.instr_per_sec], 'energy': [spm_metrics.occurrence_energy('total')* 1.0e-6]} # renormalize pJ to 
microJ ) df = pd.concat([df,new_row], ignore_index=True) diff --git a/scripts/shmoo/sample_gpu.py b/scripts/shmoo/sample_gpu.py index 6ae96a7..f734226 100644 --- a/scripts/shmoo/sample_gpu.py +++ b/scripts/shmoo/sample_gpu.py @@ -8,7 +8,17 @@ from energysim.operator.flat_matvec import flat_matvec_gpu -def sample_gpu(process_node: str, category: 'DesignCategory', core_clock_ghz: float, memory_clock_ghz: float, word_size_in_bits: int, memory_burst_in_bytes: int,threads_per_block: int, blocks_per_grid: int) -> 'GraphicsProcessingUnitMetrics': +def sample_gpu(process_node: str, + category: 'DesignCategory', + core_clock_ghz: float, + memory_clock_ghz: float, + word_size_in_bytes: int, + cache_line_size_in_bytes: int, + memory_burst_size_in_bytes: int, + memory_channels: int, + channel_width_in_bytes: int, + threads_per_block: int, + blocks_per_grid: int) -> 'GraphicsProcessingUnitMetrics': db = GraphicsProcessingUnitEnergyDatabase() full = db.load_data('../../data/gpu_energy.csv') # this returns a full DataFrame, but we ignore it #print(full) @@ -21,13 +31,20 @@ def sample_gpu(process_node: str, category: 'DesignCategory', core_clock_ghz: fl if selected_node is None: raise ValueError(f'Process {selected_node} not supported') - sample_name = process_node + '_sample' + #cache_line_size_in_bytes typically 32 or 64 + #word_size_in_bytes 4 bytes for single precision, 2 bytes for half, and 1 byte for FP8 + #memory_burst_size_in_bytes typically can be 32b, 64b, 128bytes + #memory_channels = 4 + #channel_width_in_bytes = LPDDR tends to be 2 bytes, DDR and GDDR tend to be 8 bytes, HBM is 128 bytes config = GraphicsProcessingUnitConfiguration( category, core_clock_ghz, memory_clock_ghz, - word_size_in_bits, - memory_burst_in_bytes, + word_size_in_bytes, + cache_line_size_in_bytes, + memory_burst_size_in_bytes, + memory_channels, + channel_width_in_bytes, threads_per_block, blocks_per_grid ) @@ -67,8 +84,11 @@ def randomize_gpu(nr_samples: int, process_node: str): gpu_configs = 
[] midrange_index = (nr_samples // 3) - 1 highperformance_index = (2 * nr_samples // 3) - word_size_in_bits = 32 + cache_line_size_in_bytes = 64 + word_size_in_bytes = 4 memory_burst_size_in_bytes = 64 # typically can be 32b, 64b, 128bytes + memory_channels = 4 + channel_width_in_bytes = 8 # LPDDR tends to be 2 bytes, DDR and GDDR tend to be 8 bytes, HBM is 128 bytes # Configure grid and block dimensions for a one-dimensional blocking: one thread per row-col dot product threads_per_block = 128 blocks_per_grid = (rows + threads_per_block - 1) // threads_per_block @@ -77,8 +97,11 @@ def randomize_gpu(nr_samples: int, process_node: str): category, core_clock_ghz, memory_clock_ghz, - word_size_in_bits, + word_size_in_bytes, + cache_line_size_in_bytes, memory_burst_size_in_bytes, + memory_channels, + channel_width_in_bytes, threads_per_block, blocks_per_grid ) diff --git a/scripts/shmoo/sample_spm.py b/scripts/shmoo/sample_spm.py index 4d8839b..2a809a1 100644 --- a/scripts/shmoo/sample_spm.py +++ b/scripts/shmoo/sample_spm.py @@ -60,7 +60,7 @@ def randomize_spm(nr_samples: int, process_node: str): 'category': [category], 'processor_core_ghz': [processor_clock_ghz], 'memory_core': [memory_clock_ghz], - 'performance': [spm_metrics.TIPS], + 'performance': [spm_metrics.instr_per_sec], 'energy': [spm_metrics.occurrence_energy('total')* 1.0e-6]} # renormalize pJ to microJ ) df = pd.concat([df,new_row], ignore_index=True)