From 581377bed5a047c653bcfd5425b7e3eaf575cb4a Mon Sep 17 00:00:00 2001 From: notoraptor Date: Mon, 27 Jan 2025 10:54:41 -0500 Subject: [PATCH 1/4] Update @satyaog code from PR #115 to harmonize GPU names --- config/sarc-dev.json | 89 ++++++++++++++++++++-- sarc/config.py | 44 ++++++++++- tests/sarc-test.json | 10 ++- tests/unittests/jobs/test_harmonize_gpu.py | 70 +++++++++++++++++ 4 files changed, 205 insertions(+), 8 deletions(-) create mode 100644 tests/unittests/jobs/test_harmonize_gpu.py diff --git a/config/sarc-dev.json b/config/sarc-dev.json index 29eb5fad..98c216ce 100644 --- a/config/sarc-dev.json +++ b/config/sarc-dev.json @@ -28,7 +28,42 @@ "duc_storage_command": null, "diskusage_report_command": "beegfs-ctl --cfgFile=/etc/beegfs/home.d/beegfs-client.conf --getquota --uid $USER --csv", "prometheus_url": "http://prometheus01.server.mila.quebec:9090/", - "start_date": "2022-04-01" + "start_date": "2022-04-01", + "gpus_per_nodes": { + "__DEFAULTS__": { + "rtx8000": "RTX8000" + }, + "cn-b[001-005]": { + "v100": "V100-SXM2-32GB" + }, + "cn-d[001-002]": { + "a100": "A100-SXM4-40GB" + }, + "cn-d[003-004]": { + "a100l": "A100-SXM4-80GB" + }, + "cn-e[002-003]": { + "v100": "V100-SXM2-32GB" + }, + "cn-g[001-029]": { + "a100l": "A100-SXM4-80GB" + }, + "cn-i001": { + "a100l": "A100-PCIe-80GB" + }, + "cn-j001": { + "a6000": "A6000" + }, + "cn-k[001-004]": { + "a100": "A100-SXM4-40GB" + }, + "cn-l[001-091]": { + "l40s": "L40S" + }, + "cn-n[001-002]": { + "h100": "H100-SXM5-80GB" + } + } }, "narval": { "host": "narval.computecanada.ca", @@ -42,7 +77,16 @@ "prometheus_headers_file": "secrets/drac_prometheus/headers.json", "start_date": "2022-04-01", "rgu_start_date": "2023-11-28", - "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json" + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json", + "gpus_per_nodes": { + "__DEFAULTS__": { + "a100": "A100-SXM4-40GB", + "a100_1g.5gb": "__MIG_FLAG__a100", + "a100_2g.10gb": "__MIG_FLAG__a100", + "a100_3g.20gb": 
"__MIG_FLAG__a100", + "a100_4g.20gb": "__MIG_FLAG__a100" + } + } }, "beluga": { "host": "beluga.computecanada.ca", @@ -56,7 +100,12 @@ "prometheus_headers_file": "secrets/drac_prometheus/headers.json", "start_date": "2022-04-01", "rgu_start_date": "2024-04-03", - "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json" + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json", + "gpus_per_nodes": { + "__DEFAULTS__": { + "tesla_v100-sxm2-16gb": "V100-SXM2-16GB" + } + } }, "graham": { "host": "graham.computecanada.ca", @@ -70,7 +119,30 @@ "prometheus_headers_file": null, "start_date": "2022-04-01", "rgu_start_date": "2024-04-03", - "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json" + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json", + "gpus_per_nodes": { + "gra[828-987]": { + "p100": "P100-PCIe-12GB" + }, + "gra[1147-1153]": { + "v100": "V100-PCIe-16GB" + }, + "gra[1154-1189]": { + "t4": "T4" + }, + "gra[1337-1338]": { + "v100": "V100-SXM2-32GB" + }, + "gra1342": { + "a100": "A100-SXM4-80GB" + }, + "gra[1361-1362]": { + "a100": "A100-PCIe-80GB" + }, + "gra[1363-1373]": { + "a5000": "A5000" + } + } }, "cedar": { "host": "cedar.computecanada.ca", @@ -84,7 +156,14 @@ "prometheus_headers_file": null, "start_date": "2022-04-01", "rgu_start_date": "2024-04-03", - "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json" + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json", + "gpus_per_nodes": { + "__DEFAULTS__": { + "p100": "P100-PCIe-12GB", + "p100l": "P100-PCIe-16GB", + "v100l": "V100-PCIe-32GB" + } + } } } } diff --git a/sarc/config.py b/sarc/config.py index 7ab1e679..1bb2ec38 100644 --- a/sarc/config.py +++ b/sarc/config.py @@ -7,13 +7,14 @@ from datetime import date, datetime from functools import cached_property from pathlib import Path -from typing import Any, Union +from typing import Any, Dict, Union import pydantic import tzlocal from bson import ObjectId +from hostlist import expand_hostlist from pydantic 
import BaseModel as _BaseModel -from pydantic import Extra, validator +from pydantic import Extra, Field, validator MTL = zoneinfo.ZoneInfo("America/Montreal") PST = zoneinfo.ZoneInfo("America/Vancouver") @@ -70,6 +71,10 @@ def replace(self, **replacements): return type(self)(**new_arguments) +MIG_FLAG = "__MIG_FLAG__" +DEFAULTS_FLAG = "__DEFAULTS__" + + class ClusterConfig(BaseModel): host: str = "localhost" timezone: Union[str, zoneinfo.ZoneInfo] # | does not work with Pydantic's eval @@ -87,6 +92,9 @@ class ClusterConfig(BaseModel): gpu_to_rgu_billing: Path = None slurm_conf_host_path: str = "/etc/slurm/slurm.conf" + # Dictionary mapping a node name -> gpu type -> IGUANE gpu name + gpus_per_nodes: Dict[str, Dict[str, str]] = Field(default_factory=dict) + @validator("timezone") def _timezone(cls, value): if isinstance(value, str): @@ -94,6 +102,38 @@ def _timezone(cls, value): else: return value + @validator("gpus_per_nodes") + def _expand_gpus_per_nodes(cls, value: dict): + # Convert node list to node names with `expand_hostlist` + return { + node: gpu_to_desc + for node_list, gpu_to_desc in value.items() + for node in expand_hostlist(node_list) + } + + def harmonize_gpu(self, nodename: str, gpu_type: str) -> str: + """Actual utility method to get a GPU name from given node and gpu type.""" + + gpu_type = gpu_type.lower().replace(" ", "-").split(":") + if gpu_type[0] == "gpu": + gpu_type.pop(0) + gpu_type = gpu_type[0] + + if nodename in self.gpus_per_nodes: + gpu_map = self.gpus_per_nodes[nodename] + else: + gpu_map = self.gpus_per_nodes.get(DEFAULTS_FLAG, {}) + + harmonized_gpu = gpu_map.get(gpu_type, None) + + if harmonized_gpu and harmonized_gpu.startswith(MIG_FLAG): + harmonized_gpu = self.harmonize_gpu( + nodename, harmonized_gpu[len(MIG_FLAG) :] + ) + harmonized_gpu = f"{harmonized_gpu} : {gpu_type}" + + return harmonized_gpu + @cached_property def ssh(self): from fabric import Config as FabricConfig diff --git a/tests/sarc-test.json b/tests/sarc-test.json
index 206b3f7b..d36e981d 100644 --- a/tests/sarc-test.json +++ b/tests/sarc-test.json @@ -41,7 +41,15 @@ "duc_inodes_command": null, "duc_storage_command": null, "diskusage_report_command": null, - "prometheus_url": null + "prometheus_url": null, + "gpus_per_nodes" : { + "cn-c018": { + "asupergpu": "Nec Plus Plus ULTRA GPU 2000" + }, + "cn-c[019-030]": { + "asupergpu": "Nec Plus ULTRA GPU 2000" + } + } }, "fromage": { "host": "fromage", diff --git a/tests/unittests/jobs/test_harmonize_gpu.py b/tests/unittests/jobs/test_harmonize_gpu.py new file mode 100644 index 00000000..a4a5b994 --- /dev/null +++ b/tests/unittests/jobs/test_harmonize_gpu.py @@ -0,0 +1,70 @@ +import pytest + +from sarc.config import DEFAULTS_FLAG, MIG_FLAG, ClusterConfig, config + +GPUS_PER_NODES = { + "node[0-9]": {"gpu1": "DESCRIPTIVE GPU 1"}, + "node[9-19]": {"gpu2": "DESCRIPTIVE GPU 2"}, + "node_mig20": {"gpu3": "DESCRIPTIVE GPU 3", "4g.40gb": f"{MIG_FLAG}gpu3"}, + DEFAULTS_FLAG: {"gpu_default": "DESCRIPTIVE GPU DEFAULT"}, +} + + +@pytest.mark.parametrize( + "node,gpu_type,expected,gpus_per_nodes", + [ + [ + "DoesNotExist", + "DoesNotExist", + None, + {}, + ], + [ + "node1", + "GPU1", + "DESCRIPTIVE GPU 1", + GPUS_PER_NODES, + ], + [ + "node11", + "GPU2", + "DESCRIPTIVE GPU 2", + GPUS_PER_NODES, + ], + [ + "DoesNotExist", + "GPU_DEFAULT", + "DESCRIPTIVE GPU DEFAULT", + GPUS_PER_NODES, + ], + [ + "node1", + "DoesNotExist", + None, + GPUS_PER_NODES, + ], + [ + "node_mig20", + "4g.40gb", + "DESCRIPTIVE GPU 3 : 4g.40gb", + GPUS_PER_NODES, + ], + ], +) +def test_harmonize_gpu(node, gpu_type, expected, gpus_per_nodes): + cluster = ClusterConfig(timezone="America/Montreal", gpus_per_nodes=gpus_per_nodes) + assert cluster.harmonize_gpu(node, gpu_type) == expected + + +@pytest.mark.usefixtures("standard_config") +@pytest.mark.parametrize( + "node,gpu_type,expected", + [ + ("cn-c018", "asupergpu", "Nec Plus Plus ULTRA GPU 2000"), + ("cn-c019", "asupergpu", "Nec Plus ULTRA GPU 2000"), + ("cn-c024", 
"asupergpu", "Nec Plus ULTRA GPU 2000"), + ], +) +def test_clusterconfig_harmonize_gpu(node, gpu_type, expected): + cluster = config().clusters["raisin_no_prometheus"] + assert cluster.harmonize_gpu(node, gpu_type) == expected From 4e813c6b19e80ca2bdc98308c9749718ea43fe7e Mon Sep 17 00:00:00 2001 From: notoraptor Date: Thu, 30 Jan 2025 10:21:59 -0500 Subject: [PATCH 2/4] Actually use ClusterConfig.harmonize_gpu() to get GPU names during scraping. --- sarc/config.py | 10 ++++++--- sarc/jobs/sacct.py | 22 ++++++++++++++++--- tests/functional/jobs/test_func_sacct.py | 2 +- ...ut_prometheus_json_jobs0_test_config0_.txt | 4 ++-- 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/sarc/config.py b/sarc/config.py index 1bb2ec38..ebd4b5d4 100644 --- a/sarc/config.py +++ b/sarc/config.py @@ -7,7 +7,7 @@ from datetime import date, datetime from functools import cached_property from pathlib import Path -from typing import Any, Dict, Union +from typing import Any, Dict, Optional, Union import pydantic import tzlocal @@ -111,8 +111,12 @@ def _expand_gpus_per_nodes(cls, value: dict): for node in expand_hostlist(node_list) } - def harmonize_gpu(self, nodename: str, gpu_type: str) -> str: - """Actual utility method to get a GPU name from given node and gpu type.""" + def harmonize_gpu(self, nodename: str, gpu_type: str) -> Optional[str]: + """ + Actual utility method to get a GPU name from given node and gpu type. + + Return None if GPU name cannot be inferred. + """ gpu_type = gpu_type.lower().replace(" ", "-").split(":") if gpu_type[0] == "gpu": diff --git a/sarc/jobs/sacct.py b/sarc/jobs/sacct.py index b244c89a..551e213c 100644 --- a/sarc/jobs/sacct.py +++ b/sarc/jobs/sacct.py @@ -297,6 +297,8 @@ def update_allocated_gpu_type(cluster: ClusterConfig, entry: SlurmJob) -> Option None Unable to infer gpu type. """ + gpu_type = None + if cluster.prometheus_url: # Cluster does have prometheus config. 
output = get_job_time_series( @@ -306,9 +308,9 @@ def update_allocated_gpu_type(cluster: ClusterConfig, entry: SlurmJob) -> Option dataframe=False, ) if output: - entry.allocated.gpu_type = output[0]["metric"]["gpu_type"] + gpu_type = output[0]["metric"]["gpu_type"] else: - # No prometheus config. Try to get GPU type from local JSON file. + # No prometheus config. Try to get GPU type from entry nodes. node_gpu_mapping = get_node_to_gpu(cluster.name, entry.start_time) if node_gpu_mapping: node_to_gpu = node_gpu_mapping.node_to_gpu @@ -320,6 +322,20 @@ def update_allocated_gpu_type(cluster: ClusterConfig, entry: SlurmJob) -> Option # We should not have more than 1 GPU type per job. assert len(gpu_types) <= 1 if gpu_types: - entry.allocated.gpu_type = gpu_types.pop() + gpu_type = gpu_types.pop() + + # If we found a GPU type, try to infer descriptive GPU name + if gpu_type is not None: + harmonized_gpu_names = { + cluster.harmonize_gpu(nodename, gpu_type) for nodename in entry.nodes + } + # If present, remove None from GPU names + harmonized_gpu_names.discard(None) + # If we got 1 GPU name, use it. + # Otherwise, keep default found gpu_type. + if len(harmonized_gpu_names) == 1: + gpu_type = harmonized_gpu_names.pop() + # Finally, save gpu_type into job object. 
+ entry.allocated.gpu_type = gpu_type return entry.allocated.gpu_type diff --git a/tests/functional/jobs/test_func_sacct.py b/tests/functional/jobs/test_func_sacct.py index c471871c..ca6c8eda 100644 --- a/tests/functional/jobs/test_func_sacct.py +++ b/tests/functional/jobs/test_func_sacct.py @@ -472,7 +472,7 @@ def test_get_gpu_type_without_prometheus( job = jobs[0] print(job) print(job.nodes) - assert job.allocated.gpu_type == "gpu:asupergpu:4" + assert job.allocated.gpu_type == "Nec Plus ULTRA GPU 2000" file_regression.check( f"Found {len(jobs)} job(s):\n" diff --git a/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt b/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt index 3317375f..71f97362 100644 --- a/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt +++ b/tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt @@ -42,7 +42,7 @@ Found 1 job(s): "node": 1, "billing": 1, "gres_gpu": 1, - "gpu_type": "gpu:asupergpu:4" + "gpu_type": "Nec Plus ULTRA GPU 2000" }, "stored_statistics": null -} \ No newline at end of file +} From 642f10e9945e312986c2a1c94dae4a00cc4d041f Mon Sep 17 00:00:00 2001 From: notoraptor Date: Thu, 30 Jan 2025 10:41:32 -0500 Subject: [PATCH 3/4] reformat code --- sarc/users/revision.py | 2 +- tox.ini | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sarc/users/revision.py b/sarc/users/revision.py index 26642b34..efdc506f 100644 --- a/sarc/users/revision.py +++ b/sarc/users/revision.py @@ -9,7 +9,7 @@ - All current documents have NO end date - this makes query simple as we can just look for missing end date, because previous db had no revision system none of the documents have - end dates, so all their documents are the current ones + end dates, so all their documents are the current ones - All past version 
have an end date """ diff --git a/tox.ini b/tox.ini index 11977a54..ca01dd25 100644 --- a/tox.ini +++ b/tox.ini @@ -6,13 +6,13 @@ envlist = py310 [testenv:test] description = run tests -deps = +deps = poetry allowlist_externals = podman -commands_pre = +commands_pre = podman run -dt --name testenv_mongo -p 27017:27017/tcp docker.io/library/mongo:latest -commands = +commands = poetry install --with dev poetry run coverage run --source sarc --parallel-mode -m pytest --doctest-modules --durations=50 --durations-min 1 -vv --timeout=20 -vvv tests/ {posargs} poetry run coverage combine @@ -25,19 +25,19 @@ commands_post = description = run linters deps = pylint -commands = +commands = pylint sarc [testenv:black] description = run linters deps = black -commands = +commands = black --check . [testenv:isort] description = run linters deps = isort -commands = +commands = isort -c --df --profile black . From 0d4a497d4947ac5b246c9a3e0e03e9f170818cf4 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Mon, 3 Feb 2025 07:42:12 -0500 Subject: [PATCH 4/4] Add supplementary harmonized GPUs Fix and improve harmonization code --- config/sarc-dev.json | 25 +++++++++++++++++++++---- sarc/config.py | 16 ++++++++++------ 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/config/sarc-dev.json b/config/sarc-dev.json index 98c216ce..f695a9b7 100644 --- a/config/sarc-dev.json +++ b/config/sarc-dev.json @@ -31,7 +31,16 @@ "start_date": "2022-04-01", "gpus_per_nodes": { "__DEFAULTS__": { - "rtx8000": "RTX8000" + "rtx8000": "RTX8000", + "NVIDIA A100 80GB PCIe": "A100-PCIe-80GB", + "NVIDIA A100-SXM4-40GB": "A100-SXM4-40GB", + "NVIDIA A100-SXM4-80GB": "A100-SXM4-80GB", + "NVIDIA H100 80GB HBM3": "H100-SXM5-80GB", + "NVIDIA L40S": "L40S", + "NVIDIA RTX A6000": "A6000", + "Quadro RTX 8000": "RTX8000", + "Tesla V100-SXM2-32GB": "V100-SXM2-32GB", + "Tesla V100-SXM2-32GB-LS": "V100-SXM2-32GB" }, "cn-b[001-005]": { "v100": "V100-SXM2-32GB" @@ -46,7 +55,10 @@ "v100": "V100-SXM2-32GB" }, 
"cn-g[001-029]": { - "a100l": "A100-SXM4-80GB" + "a100l": "A100-SXM4-80GB", + "2g.20gb": "__MIG_FLAG__a100l", + "3g.40gb": "__MIG_FLAG__a100l", + "4g.40gb": "__MIG_FLAG__a100l" }, "cn-i001": { "a100l": "A100-PCIe-80GB" @@ -83,8 +95,12 @@ "a100": "A100-SXM4-40GB", "a100_1g.5gb": "__MIG_FLAG__a100", "a100_2g.10gb": "__MIG_FLAG__a100", + "2g.10gb": "__MIG_FLAG__a100", "a100_3g.20gb": "__MIG_FLAG__a100", - "a100_4g.20gb": "__MIG_FLAG__a100" + "3g.20gb": "__MIG_FLAG__a100", + "a100_4g.20gb": "__MIG_FLAG__a100", + "4g.20gb": "__MIG_FLAG__a100", + "NVIDIA A100-SXM4-40GB": "A100-SXM4-40GB" } } }, @@ -103,7 +119,8 @@ "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json", "gpus_per_nodes": { "__DEFAULTS__": { - "tesla_v100-sxm2-16gb": "V100-SXM2-16GB" + "tesla_v100-sxm2-16gb": "V100-SXM2-16GB", + "Tesla V100-SXM2-16GB": "V100-SXM2-16GB" } } }, diff --git a/sarc/config.py b/sarc/config.py index ebd4b5d4..3d8fb210 100644 --- a/sarc/config.py +++ b/sarc/config.py @@ -106,7 +106,10 @@ def _timezone(cls, value): def _expand_gpus_per_nodes(cls, value: dict): # Convert node list to node names with `expand_hostlist` return { - node: gpu_to_desc + node: { + gpu_type.lower().replace(" ", "-"): gpu_desc + for gpu_type, gpu_desc in gpu_to_desc.items() + } for node_list, gpu_to_desc in value.items() for node in expand_hostlist(node_list) } @@ -123,13 +126,14 @@ def harmonize_gpu(self, nodename: str, gpu_type: str) -> Optional[str]: gpu_type.pop(0) gpu_type = gpu_type[0] - if nodename in self.gpus_per_nodes: - gpu_map = self.gpus_per_nodes[nodename] - else: - gpu_map = self.gpus_per_nodes.get(DEFAULTS_FLAG, {}) + # Try to get harmonized GPU from nodename mapping + harmonized_gpu = self.gpus_per_nodes.get(nodename, {}).get(gpu_type) - harmonized_gpu = gpu_map.get(gpu_type, None) + # Otherwise, try to get harmonized GPU from default mapping + if harmonized_gpu is None: + harmonized_gpu = self.gpus_per_nodes.get(DEFAULTS_FLAG, {}).get(gpu_type) + # For MIG GPUs, use this method 
recursively if harmonized_gpu and harmonized_gpu.startswith(MIG_FLAG): harmonized_gpu = self.harmonize_gpu( nodename, harmonized_gpu[len(MIG_FLAG) :]