[SARC-312] Update @satyaog code from PR #115 to harmonize GPU names #148

Open: wants to merge 4 commits into base: master
106 changes: 101 additions & 5 deletions config/sarc-dev.json
@@ -28,7 +28,54 @@
"duc_storage_command": null,
"diskusage_report_command": "beegfs-ctl --cfgFile=/etc/beegfs/home.d/beegfs-client.conf --getquota --uid $USER --csv",
"prometheus_url": "http://prometheus01.server.mila.quebec:9090/",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"gpus_per_nodes": {
"__DEFAULTS__": {
"rtx8000": "RTX8000",
"NVIDIA A100 80GB PCIe": "A100-PCIe-80GB",
"NVIDIA A100-SXM4-40GB": "A100-SXM4-40GB",
"NVIDIA A100-SXM4-80GB": "A100-SXM4-80GB",
"NVIDIA H100 80GB HBM3": "H100-SXM5-80GB",
"NVIDIA L40S": "L40S",
"NVIDIA RTX A6000": "A6000",
"Quadro RTX 8000": "RTX8000",
"Tesla V100-SXM2-32GB": "V100-SXM2-32GB",
"Tesla V100-SXM2-32GB-LS": "V100-SXM2-32GB"
},
"cn-b[001-005]": {
"v100": "V100-SXM2-32GB"
},
"cn-d[001-002]": {
"a100": "A100-SXM4-40GB"
},
"cn-d[003-004]": {
"a100l": "A100-SXM4-80GB"
},
"cn-e[002-003]": {
"v100": "V100-SXM2-32GB"
},
"cn-g[001-029]": {
"a100l": "A100-SXM4-80GB",
"2g.20gb": "__MIG_FLAG__a100l",
"3g.40gb": "__MIG_FLAG__a100l",
"4g.40gb": "__MIG_FLAG__a100l"
},
"cn-i001": {
"a100l": "A100-PCIe-80GB"
},
"cn-j001": {
"a6000": "A6000"
},
"cn-k[001-004]": {
"a100": "A100-SXM4-40GB"
},
"cn-l[001-091]": {
"l40s": "L40S"
},
"cn-n[001-002]": {
"h100": "H100-SXM5-80GB"
}
}
},
"narval": {
"host": "narval.computecanada.ca",
@@ -42,7 +89,20 @@
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01",
"rgu_start_date": "2023-11-28",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json",
"gpus_per_nodes": {
"__DEFAULTS__": {
"a100": "A100-SXM4-40GB",
"a100_1g.5gb": "__MIG_FLAG__a100",
"a100_2g.10gb": "__MIG_FLAG__a100",
"2g.10gb": "__MIG_FLAG__a100",
"a100_3g.20gb": "__MIG_FLAG__a100",
"3g.20gb": "__MIG_FLAG__a100",
"a100_4g.20gb": "__MIG_FLAG__a100",
"4g.20gb": "__MIG_FLAG__a100",
"NVIDIA A100-SXM4-40GB": "A100-SXM4-40GB"
}
}
},
"beluga": {
"host": "beluga.computecanada.ca",
@@ -56,7 +116,13 @@
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json",
"gpus_per_nodes": {
"__DEFAULTS__": {
"tesla_v100-sxm2-16gb": "V100-SXM2-16GB",
"Tesla V100-SXM2-16GB": "V100-SXM2-16GB"
}
}
},
"graham": {
"host": "graham.computecanada.ca",
@@ -70,7 +136,30 @@
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json",
"gpus_per_nodes": {
"gra[828-987]": {
"p100": "P100-PCIe-12GB"
},
"gra[1147-1153]": {
"v100": "V100-PCIe-16GB"
},
"gra[1154-1189]": {
"t4": "T4"
},
"gra[1337-1338]": {
"v100": "V100-SXM2-32GB"
},
"gra1342": {
"a100": "A100-SXM4-80GB"
},
"gra[1361-1362]": {
"a100": "A100-PCIe-80GB"
},
"gra[1363-1373]": {
"a5000": "A5000"
}
}
},
"cedar": {
"host": "cedar.computecanada.ca",
@@ -84,7 +173,14 @@
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json",
"gpus_per_nodes": {
"__DEFAULTS__": {
"p100": "P100-PCIe-12GB",
"p100l": "P100-PCIe-16GB",
"v100l": "V100-PCIe-32GB"
}
}
}
}
}
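
The new `gpus_per_nodes` mapping goes from a node name or hostlist pattern to a GPU type to its harmonized name, with `__DEFAULTS__` as a cluster-wide fallback and `__MIG_FLAG__<type>` marking MIG slices that resolve through their parent GPU. Below is a minimal sketch of how a hostlist key is expanded into per-node entries, using the same `hostlist` package that `sarc/config.py` imports (the dict values here are just examples, not part of the diff):

```python
from hostlist import expand_hostlist

raw = {"cn-g[001-029]": {"a100l": "A100-SXM4-80GB"}}

# Mirror of the validator in sarc/config.py: one entry per concrete node name.
expanded = {
    node: gpu_map
    for node_list, gpu_map in raw.items()
    for node in expand_hostlist(node_list)
}
assert "cn-g001" in expanded and "cn-g029" in expanded  # zero-padding preserved
```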
52 changes: 50 additions & 2 deletions sarc/config.py
@@ -7,13 +7,14 @@
from datetime import date, datetime
from functools import cached_property
from pathlib import Path
from typing import Any, Union
from typing import Any, Dict, Optional, Union

import pydantic
import tzlocal
from bson import ObjectId
from hostlist import expand_hostlist
from pydantic import BaseModel as _BaseModel
from pydantic import Extra, validator
from pydantic import Extra, Field, validator

MTL = zoneinfo.ZoneInfo("America/Montreal")
PST = zoneinfo.ZoneInfo("America/Vancouver")
@@ -70,6 +71,10 @@ def replace(self, **replacements):
return type(self)(**new_arguments)


# Prefix marking a MIG slice entry; must match the "__MIG_FLAG__" prefix used
# in the JSON configs (e.g. "__MIG_FLAG__a100l" in config/sarc-dev.json).
MIG_FLAG = "__MIG_FLAG__"
# Key holding cluster-wide fallback mappings in "gpus_per_nodes".
DEFAULTS_FLAG = "__DEFAULTS__"


class ClusterConfig(BaseModel):
host: str = "localhost"
timezone: Union[str, zoneinfo.ZoneInfo] # | does not work with Pydantic's eval
@@ -87,13 +92,56 @@ class ClusterConfig(BaseModel):
gpu_to_rgu_billing: Path = None
slurm_conf_host_path: str = "/etc/slurm/slurm.conf"

# Mapping: node name (or hostlist pattern) -> GPU type -> harmonized IGUANE GPU name
gpus_per_nodes: Dict[str, Dict[str, str]] = Field(default_factory=dict)

@validator("timezone")
def _timezone(cls, value):
if isinstance(value, str):
return zoneinfo.ZoneInfo(value)
else:
return value

@validator("gpus_per_nodes")
def _expand_gpus_per_nodes(cls, value: dict):
# Expand hostlist patterns (e.g. "cn-g[001-029]") into individual node names
# with `expand_hostlist`, and normalize GPU type keys to lowercase with dashes
return {
node: {
gpu_type.lower().replace(" ", "-"): gpu_desc
for gpu_type, gpu_desc in gpu_to_desc.items()
}
for node_list, gpu_to_desc in value.items()
for node in expand_hostlist(node_list)
}

def harmonize_gpu(self, nodename: str, gpu_type: str) -> Optional[str]:
"""
Return the harmonized (IGUANE) GPU name for the given node and GPU type.

Return None if the GPU name cannot be inferred.
"""

# Normalize a GRES string such as "gpu:a100l:4" down to its GPU type token.
gpu_type = gpu_type.lower().replace(" ", "-").split(":")
if gpu_type[0] == "gpu":
gpu_type.pop(0)
gpu_type = gpu_type[0]

# Try to get harmonized GPU from nodename mapping
harmonized_gpu = self.gpus_per_nodes.get(nodename, {}).get(gpu_type)

# Otherwise, try to get harmonized GPU from default mapping
if harmonized_gpu is None:
harmonized_gpu = self.gpus_per_nodes.get(DEFAULTS_FLAG, {}).get(gpu_type)

# For MIG GPUs, use this method recursively
if harmonized_gpu and harmonized_gpu.startswith(MIG_FLAG):
harmonized_gpu = self.harmonize_gpu(
nodename, harmonized_gpu[len(MIG_FLAG) :]
)
harmonized_gpu = f"{harmonized_gpu} : {gpu_type}"

return harmonized_gpu

@cached_property
def ssh(self):
from fabric import Config as FabricConfig
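
A hedged usage sketch of the new `harmonize_gpu` method (node names and mapping borrowed from `config/sarc-dev.json` above; this snippet is illustrative, not part of the diff):

```python
from sarc.config import ClusterConfig

cluster = ClusterConfig(
    timezone="America/Montreal",
    gpus_per_nodes={
        "cn-g[001-029]": {
            "a100l": "A100-SXM4-80GB",
            "2g.20gb": "__MIG_FLAG__a100l",
        }
    },
)

# Slurm GRES strings are normalized: the "gpu:" prefix and count suffix are dropped.
assert cluster.harmonize_gpu("cn-g001", "gpu:a100l:4") == "A100-SXM4-80GB"
# MIG slices resolve recursively through the parent GPU, keeping the slice name.
assert cluster.harmonize_gpu("cn-g001", "2g.20gb") == "A100-SXM4-80GB : 2g.20gb"
# Unknown node/GPU pairs yield None so callers can fall back to the raw type.
assert cluster.harmonize_gpu("cn-x999", "mystery") is None
```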
22 changes: 19 additions & 3 deletions sarc/jobs/sacct.py
@@ -297,6 +297,8 @@ def update_allocated_gpu_type(cluster: ClusterConfig, entry: SlurmJob) -> Option
None
Unable to infer gpu type.
"""
gpu_type = None

if cluster.prometheus_url:
# Cluster does have prometheus config.
output = get_job_time_series(
@@ -306,9 +308,9 @@
dataframe=False,
)
if output:
entry.allocated.gpu_type = output[0]["metric"]["gpu_type"]
gpu_type = output[0]["metric"]["gpu_type"]
else:
# No prometheus config. Try to get GPU type from local JSON file.
# No prometheus config. Try to get GPU type from entry nodes.
node_gpu_mapping = get_node_to_gpu(cluster.name, entry.start_time)
if node_gpu_mapping:
node_to_gpu = node_gpu_mapping.node_to_gpu
@@ -320,6 +322,20 @@
# We should not have more than 1 GPU type per job.
assert len(gpu_types) <= 1
if gpu_types:
entry.allocated.gpu_type = gpu_types.pop()
gpu_type = gpu_types.pop()

# If we found a GPU type, try to infer a harmonized, descriptive GPU name
if gpu_type is not None:
harmonized_gpu_names = {
cluster.harmonize_gpu(nodename, gpu_type) for nodename in entry.nodes
}
# If present, remove None from GPU names
harmonized_gpu_names.discard(None)
# If exactly one harmonized name was found, use it.
# Otherwise, keep the gpu_type found above.
if len(harmonized_gpu_names) == 1:
gpu_type = harmonized_gpu_names.pop()
# Finally, save gpu_type into job object.
entry.allocated.gpu_type = gpu_type

return entry.allocated.gpu_type
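
For multi-node jobs, the set comprehension above harmonizes the GPU type once per node and only commits the result when it is unambiguous. An illustrative sketch, assuming a `cluster` built from the graham mapping in `config/sarc-dev.json` (where `v100` means `V100-PCIe-16GB` on `gra[1147-1153]` but `V100-SXM2-32GB` on `gra[1337-1338]`):

```python
# Two nodes disagree on what "v100" means, so the set keeps two entries and
# update_allocated_gpu_type falls back to the raw "v100" instead of guessing.
names = {cluster.harmonize_gpu(node, "v100") for node in ("gra1147", "gra1337")}
names.discard(None)
assert names == {"V100-PCIe-16GB", "V100-SXM2-32GB"}
```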
2 changes: 1 addition & 1 deletion sarc/users/revision.py
@@ -9,7 +9,7 @@
- All current documents have NO end date
- this makes query simple as we can just look for missing end date,
because previous db had no revision system none of the documents have
end dates, so all their documents are the current ones
end dates, so all their documents are the current ones

- All past version have an end date
"""
2 changes: 1 addition & 1 deletion tests/functional/jobs/test_func_sacct.py
@@ -472,7 +472,7 @@ def test_get_gpu_type_without_prometheus(
job = jobs[0]
print(job)
print(job.nodes)
assert job.allocated.gpu_type == "gpu:asupergpu:4"
assert job.allocated.gpu_type == "Nec Plus ULTRA GPU 2000"

file_regression.check(
f"Found {len(jobs)} job(s):\n"
@@ -42,7 +42,7 @@ Found 1 job(s):
"node": 1,
"billing": 1,
"gres_gpu": 1,
"gpu_type": "gpu:asupergpu:4"
"gpu_type": "Nec Plus ULTRA GPU 2000"
},
"stored_statistics": null
}
}
10 changes: 9 additions & 1 deletion tests/sarc-test.json
@@ -41,7 +41,15 @@
"duc_inodes_command": null,
"duc_storage_command": null,
"diskusage_report_command": null,
"prometheus_url": null
"prometheus_url": null,
"gpus_per_nodes" : {
"cn-c018": {
"asupergpu": "Nec Plus Plus ULTRA GPU 2000"
},
"cn-c[019-030]": {
"asupergpu": "Nec Plus ULTRA GPU 2000"
}
}
},
"fromage": {
"host": "fromage",
70 changes: 70 additions & 0 deletions tests/unittests/jobs/test_harmonize_gpu.py
@@ -0,0 +1,70 @@
import pytest

from sarc.config import DEFAULTS_FLAG, MIG_FLAG, ClusterConfig, config

GPUS_PER_NODES = {
"node[0-9]": {"gpu1": "DESCRIPTIVE GPU 1"},
"node[9-19]": {"gpu2": "DESCRIPTIVE GPU 2"},
"node_mig20": {"gpu3": "DESCRIPTIVE GPU 3", "4g.40gb": f"{MIG_FLAG}gpu3"},
DEFAULTS_FLAG: {"gpu_default": "DESCRIPTIVE GPU DEFAULT"},
}


@pytest.mark.parametrize(
"node,gpu_type,expected,gpus_per_nodes",
[
[
"DoesNotExist",
"DoesNotExist",
None,
{},
],
[
"node1",
"GPU1",
"DESCRIPTIVE GPU 1",
GPUS_PER_NODES,
],
[
"node11",
"GPU2",
"DESCRIPTIVE GPU 2",
GPUS_PER_NODES,
],
[
"DoesNotExist",
"GPU_DEFAULT",
"DESCRIPTIVE GPU DEFAULT",
GPUS_PER_NODES,
],
[
"node1",
"DoesNotExist",
None,
GPUS_PER_NODES,
],
[
"node_mig20",
"4g.40gb",
"DESCRIPTIVE GPU 3 : 4g.40gb",
GPUS_PER_NODES,
],
],
)
def test_harmonize_gpu(node, gpu_type, expected, gpus_per_nodes):
cluster = ClusterConfig(timezone="America/Montreal", gpus_per_nodes=gpus_per_nodes)
assert cluster.harmonize_gpu(node, gpu_type) == expected


@pytest.mark.usefixtures("standard_config")
@pytest.mark.parametrize(
"node,gpu_type,expected",
[
("cn-c018", "asupergpu", "Nec Plus Plus ULTRA GPU 2000"),
("cn-c019", "asupergpu", "Nec Plus ULTRA GPU 2000"),
("cn-c024", "asupergpu", "Nec Plus ULTRA GPU 2000"),
],
)
def test_clusterconfig_harmonize_gpu(node, gpu_type, expected):
cluster = config().clusters["raisin_no_prometheus"]
assert cluster.harmonize_gpu(node, gpu_type) == expected