[SARC-312] Update @satyaog code from PR #115 to harmonize GPU names #148

Open: wants to merge 4 commits into base: master
106 changes: 101 additions & 5 deletions config/sarc-dev.json
@@ -28,7 +28,54 @@
"duc_storage_command": null,
"diskusage_report_command": "beegfs-ctl --cfgFile=/etc/beegfs/home.d/beegfs-client.conf --getquota --uid $USER --csv",
"prometheus_url": "http://prometheus01.server.mila.quebec:9090/",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"gpus_per_nodes": {
"__DEFAULTS__": {
"rtx8000": "RTX8000",
"NVIDIA A100 80GB PCIe": "A100-PCIe-80GB",
"NVIDIA A100-SXM4-40GB": "A100-SXM4-40GB",
"NVIDIA A100-SXM4-80GB": "A100-SXM4-80GB",
"NVIDIA H100 80GB HBM3": "H100-SXM5-80GB",
"NVIDIA L40S": "L40S",
"NVIDIA RTX A6000": "A6000",
"Quadro RTX 8000": "RTX8000",
"Tesla V100-SXM2-32GB": "V100-SXM2-32GB",
"Tesla V100-SXM2-32GB-LS": "V100-SXM2-32GB"
},
"cn-b[001-005]": {
"v100": "V100-SXM2-32GB"
},
"cn-d[001-002]": {
"a100": "A100-SXM4-40GB"
},
"cn-d[003-004]": {
"a100l": "A100-SXM4-80GB"
},
"cn-e[002-003]": {
"v100": "V100-SXM2-32GB"
},
"cn-g[001-029]": {
"a100l": "A100-SXM4-80GB",
"2g.20gb": "__MIG_FLAG__a100l",
"3g.40gb": "__MIG_FLAG__a100l",
"4g.40gb": "__MIG_FLAG__a100l"
},
"cn-i001": {
"a100l": "A100-PCIe-80GB"
},
"cn-j001": {
"a6000": "A6000"
},
"cn-k[001-004]": {
"a100": "A100-SXM4-40GB"
},
"cn-l[001-091]": {
"l40s": "L40S"
},
"cn-n[001-002]": {
"h100": "H100-SXM5-80GB"
}
}
},
"narval": {
"host": "narval.computecanada.ca",
@@ -42,7 +89,20 @@
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01",
"rgu_start_date": "2023-11-28",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json",
"gpus_per_nodes": {
"__DEFAULTS__": {
"a100": "A100-SXM4-40GB",
"a100_1g.5gb": "__MIG_FLAG__a100",
"a100_2g.10gb": "__MIG_FLAG__a100",
"2g.10gb": "__MIG_FLAG__a100",
"a100_3g.20gb": "__MIG_FLAG__a100",
"3g.20gb": "__MIG_FLAG__a100",
"a100_4g.20gb": "__MIG_FLAG__a100",
"4g.20gb": "__MIG_FLAG__a100",
"NVIDIA A100-SXM4-40GB": "A100-SXM4-40GB"
}
}
},
"beluga": {
"host": "beluga.computecanada.ca",
@@ -56,7 +116,13 @@
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json",
"gpus_per_nodes": {
"__DEFAULTS__": {
"tesla_v100-sxm2-16gb": "V100-SXM2-16GB",
"Tesla V100-SXM2-16GB": "V100-SXM2-16GB"
}
}
},
"graham": {
"host": "graham.computecanada.ca",
@@ -70,7 +136,30 @@
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json",
"gpus_per_nodes": {
"gra[828-987]": {
"p100": "P100-PCIe-12GB"
},
"gra[1147-1153]": {
"v100": "V100-PCIe-16GB"
},
"gra[1154-1189]": {
"t4": "T4"
},
"gra[1337-1338]": {
"v100": "V100-SXM2-32GB"
},
"gra1342": {
"a100": "A100-SXM4-80GB"
},
"gra[1361-1362]": {
"a100": "A100-PCIe-80GB"
},
"gra[1363-1373]": {
"a5000": "A5000"
}
}
},
"cedar": {
"host": "cedar.computecanada.ca",
@@ -84,7 +173,14 @@
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json",
"gpus_per_nodes": {
"__DEFAULTS__": {
"p100": "P100-PCIe-12GB",
"p100l": "P100-PCIe-16GB",
"v100l": "V100-PCIe-32GB"
}
}
}
}
}
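
The new `gpus_per_nodes` mapping goes from a node name or hostlist pattern to a GPU type to its harmonized name, with `__DEFAULTS__` as a cluster-wide fallback and `__MIG_FLAG__<type>` marking MIG slices that resolve through their parent GPU. Below is a minimal sketch of how a hostlist key is expanded into per-node entries, using the same `hostlist` package that `sarc/config.py` imports (the dict values here are just examples, not part of the diff):

```python
from hostlist import expand_hostlist

raw = {"cn-g[001-029]": {"a100l": "A100-SXM4-80GB"}}

# Mirror of the validator in sarc/config.py: one entry per concrete node name.
expanded = {
    node: gpu_map
    for node_list, gpu_map in raw.items()
    for node in expand_hostlist(node_list)
}
assert "cn-g001" in expanded and "cn-g029" in expanded  # zero-padding preserved
```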
52 changes: 50 additions & 2 deletions sarc/config.py
@@ -7,13 +7,14 @@
from datetime import date, datetime
from functools import cached_property
from pathlib import Path
from typing import Any, Union
from typing import Any, Dict, Optional, Union

import pydantic
import tzlocal
from bson import ObjectId
from hostlist import expand_hostlist
from pydantic import BaseModel as _BaseModel
from pydantic import Extra, validator
from pydantic import Extra, Field, validator

MTL = zoneinfo.ZoneInfo("America/Montreal")
PST = zoneinfo.ZoneInfo("America/Vancouver")
@@ -70,6 +71,10 @@ def replace(self, **replacements):
return type(self)(**new_arguments)


# Prefix marking a MIG slice entry; must match the "__MIG_FLAG__" prefix used
# in the JSON configs (e.g. "__MIG_FLAG__a100l" in config/sarc-dev.json).
MIG_FLAG = "__MIG_FLAG__"
# Key holding cluster-wide fallback mappings in "gpus_per_nodes".
DEFAULTS_FLAG = "__DEFAULTS__"


class ClusterConfig(BaseModel):
host: str = "localhost"
timezone: Union[str, zoneinfo.ZoneInfo] # | does not work with Pydantic's eval
@@ -87,13 +92,56 @@ class ClusterConfig(BaseModel):
gpu_to_rgu_billing: Path = None
slurm_conf_host_path: str = "/etc/slurm/slurm.conf"

# Mapping: node name (or hostlist pattern) -> GPU type -> harmonized IGUANE GPU name
gpus_per_nodes: Dict[str, Dict[str, str]] = Field(default_factory=dict)

@validator("timezone")
def _timezone(cls, value):
if isinstance(value, str):
return zoneinfo.ZoneInfo(value)
else:
return value

@validator("gpus_per_nodes")
def _expand_gpus_per_nodes(cls, value: dict):
# Expand hostlist patterns (e.g. "cn-g[001-029]") into individual node names
# with `expand_hostlist`, and normalize GPU type keys to lowercase with dashes
return {
node: {
gpu_type.lower().replace(" ", "-"): gpu_desc
for gpu_type, gpu_desc in gpu_to_desc.items()
}
for node_list, gpu_to_desc in value.items()
for node in expand_hostlist(node_list)
}

def harmonize_gpu(self, nodename: str, gpu_type: str) -> Optional[str]:
"""
Return the harmonized (IGUANE) GPU name for the given node and GPU type.

Return None if the GPU name cannot be inferred.
"""

# Normalize a GRES string such as "gpu:a100l:4" down to its GPU type token.
gpu_type = gpu_type.lower().replace(" ", "-").split(":")
if gpu_type[0] == "gpu":
gpu_type.pop(0)
gpu_type = gpu_type[0]

# Try to get harmonized GPU from nodename mapping
harmonized_gpu = self.gpus_per_nodes.get(nodename, {}).get(gpu_type)

# Otherwise, try to get harmonized GPU from default mapping
if harmonized_gpu is None:
harmonized_gpu = self.gpus_per_nodes.get(DEFAULTS_FLAG, {}).get(gpu_type)

# For MIG GPUs, use this method recursively
if harmonized_gpu and harmonized_gpu.startswith(MIG_FLAG):
harmonized_gpu = self.harmonize_gpu(
nodename, harmonized_gpu[len(MIG_FLAG) :]
)
harmonized_gpu = f"{harmonized_gpu} : {gpu_type}"

return harmonized_gpu

@cached_property
def ssh(self):
from fabric import Config as FabricConfig
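
A hedged usage sketch of the new `harmonize_gpu` method (node names and mapping borrowed from `config/sarc-dev.json` above; this snippet is illustrative, not part of the diff):

```python
from sarc.config import ClusterConfig

cluster = ClusterConfig(
    timezone="America/Montreal",
    gpus_per_nodes={
        "cn-g[001-029]": {
            "a100l": "A100-SXM4-80GB",
            "2g.20gb": "__MIG_FLAG__a100l",
        }
    },
)

# Slurm GRES strings are normalized: the "gpu:" prefix and count suffix are dropped.
assert cluster.harmonize_gpu("cn-g001", "gpu:a100l:4") == "A100-SXM4-80GB"
# MIG slices resolve recursively through the parent GPU, keeping the slice name.
assert cluster.harmonize_gpu("cn-g001", "2g.20gb") == "A100-SXM4-80GB : 2g.20gb"
# Unknown node/GPU pairs yield None so callers can fall back to the raw type.
assert cluster.harmonize_gpu("cn-x999", "mystery") is None
```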
22 changes: 19 additions & 3 deletions sarc/jobs/sacct.py
@@ -297,6 +297,8 @@ def update_allocated_gpu_type(cluster: ClusterConfig, entry: SlurmJob) -> Option
None
Unable to infer gpu type.
"""
gpu_type = None

if cluster.prometheus_url:
# Cluster does have prometheus config.
output = get_job_time_series(
@@ -306,9 +308,9 @@
dataframe=False,
)
if output:
entry.allocated.gpu_type = output[0]["metric"]["gpu_type"]
gpu_type = output[0]["metric"]["gpu_type"]
else:
# No prometheus config. Try to get GPU type from local JSON file.
# No prometheus config. Try to get GPU type from entry nodes.
node_gpu_mapping = get_node_to_gpu(cluster.name, entry.start_time)
if node_gpu_mapping:
node_to_gpu = node_gpu_mapping.node_to_gpu
@@ -320,6 +322,20 @@
# We should not have more than 1 GPU type per job.
assert len(gpu_types) <= 1
if gpu_types:
entry.allocated.gpu_type = gpu_types.pop()
gpu_type = gpu_types.pop()

# If we found a GPU type, try to infer a harmonized, descriptive GPU name
if gpu_type is not None:
harmonized_gpu_names = {
cluster.harmonize_gpu(nodename, gpu_type) for nodename in entry.nodes
}
# If present, remove None from GPU names
harmonized_gpu_names.discard(None)
# If exactly one harmonized name was found, use it.
# Otherwise, keep the gpu_type found above.
if len(harmonized_gpu_names) == 1:
gpu_type = harmonized_gpu_names.pop()
# Finally, save gpu_type into job object.
entry.allocated.gpu_type = gpu_type

return entry.allocated.gpu_type
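
For multi-node jobs, the set comprehension above harmonizes the GPU type once per node and only commits the result when it is unambiguous. An illustrative sketch, assuming a `cluster` built from the graham mapping in `config/sarc-dev.json` (where `v100` means `V100-PCIe-16GB` on `gra[1147-1153]` but `V100-SXM2-32GB` on `gra[1337-1338]`):

```python
# Two nodes disagree on what "v100" means, so the set keeps two entries and
# update_allocated_gpu_type falls back to the raw "v100" instead of guessing.
names = {cluster.harmonize_gpu(node, "v100") for node in ("gra1147", "gra1337")}
names.discard(None)
assert names == {"V100-PCIe-16GB", "V100-SXM2-32GB"}
```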
2 changes: 1 addition & 1 deletion sarc/users/revision.py
@@ -9,7 +9,7 @@
- All current documents have NO end date
- this makes query simple as we can just look for missing end date,
because previous db had no revision system none of the documents have
end dates, so all their documents are the current ones
end dates, so all their documents are the current ones

- All past version have an end date
"""
2 changes: 1 addition & 1 deletion tests/functional/jobs/test_func_sacct.py
@@ -472,7 +472,7 @@ def test_get_gpu_type_without_prometheus(
job = jobs[0]
print(job)
print(job.nodes)
assert job.allocated.gpu_type == "gpu:asupergpu:4"
assert job.allocated.gpu_type == "Nec Plus ULTRA GPU 2000"

file_regression.check(
f"Found {len(jobs)} job(s):\n"
@@ -42,7 +42,7 @@ Found 1 job(s):
"node": 1,
"billing": 1,
"gres_gpu": 1,
"gpu_type": "gpu:asupergpu:4"
"gpu_type": "Nec Plus ULTRA GPU 2000"
},
"stored_statistics": null
}
}
10 changes: 9 additions & 1 deletion tests/sarc-test.json
@@ -41,7 +41,15 @@
"duc_inodes_command": null,
"duc_storage_command": null,
"diskusage_report_command": null,
"prometheus_url": null
"prometheus_url": null,
"gpus_per_nodes" : {
"cn-c018": {
"asupergpu": "Nec Plus Plus ULTRA GPU 2000"
},
"cn-c[019-030]": {
"asupergpu": "Nec Plus ULTRA GPU 2000"
}
}
},
"fromage": {
"host": "fromage",
70 changes: 70 additions & 0 deletions tests/unittests/jobs/test_harmonize_gpu.py
@@ -0,0 +1,70 @@
import pytest

from sarc.config import DEFAULTS_FLAG, MIG_FLAG, ClusterConfig, config

GPUS_PER_NODES = {
"node[0-9]": {"gpu1": "DESCRIPTIVE GPU 1"},
"node[9-19]": {"gpu2": "DESCRIPTIVE GPU 2"},
"node_mig20": {"gpu3": "DESCRIPTIVE GPU 3", "4g.40gb": f"{MIG_FLAG}gpu3"},
DEFAULTS_FLAG: {"gpu_default": "DESCRIPTIVE GPU DEFAULT"},
}


@pytest.mark.parametrize(
"node,gpu_type,expected,gpus_per_nodes",
[
[
"DoesNotExist",
"DoesNotExist",
None,
{},
],
[
"node1",
"GPU1",
"DESCRIPTIVE GPU 1",
GPUS_PER_NODES,
],
[
"node11",
"GPU2",
"DESCRIPTIVE GPU 2",
GPUS_PER_NODES,
],
[
"DoesNotExist",
"GPU_DEFAULT",
"DESCRIPTIVE GPU DEFAULT",
GPUS_PER_NODES,
],
[
"node1",
"DoesNotExist",
None,
GPUS_PER_NODES,
],
[
"node_mig20",
"4g.40gb",
"DESCRIPTIVE GPU 3 : 4g.40gb",
GPUS_PER_NODES,
],
],
)
def test_harmonize_gpu(node, gpu_type, expected, gpus_per_nodes):
cluster = ClusterConfig(timezone="America/Montreal", gpus_per_nodes=gpus_per_nodes)
assert cluster.harmonize_gpu(node, gpu_type) == expected


@pytest.mark.usefixtures("standard_config")
@pytest.mark.parametrize(
"node,gpu_type,expected",
[
("cn-c018", "asupergpu", "Nec Plus Plus ULTRA GPU 2000"),
("cn-c019", "asupergpu", "Nec Plus ULTRA GPU 2000"),
("cn-c024", "asupergpu", "Nec Plus ULTRA GPU 2000"),
],
)
def test_clusterconfig_harmonize_gpu(node, gpu_type, expected):
cluster = config().clusters["raisin_no_prometheus"]
assert cluster.harmonize_gpu(node, gpu_type) == expected