Skip to content

Commit

Permalink
Harmonize gres across clusters
Browse files Browse the repository at this point in the history
  • Loading branch information
satyaog committed Apr 2, 2024
1 parent 167a4de commit 262d37c
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 11 deletions.
6 changes: 5 additions & 1 deletion sarc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ class ClusterConfig(BaseModel):
duc_storage_command: str = None
diskusage_report_command: str = None
start_date: str = "2022-04-01"
gpus: list = []
harmonize_gpu_map: dict = {}

@validator("timezone")
def _timezone(cls, value):
Expand Down Expand Up @@ -135,7 +137,9 @@ def node_to_gpu(self):
"""
from .jobs.node_gpu_mapping import NodeToGPUMapping

return NodeToGPUMapping(self.name, self.nodes_info_file)
return NodeToGPUMapping(
self.name, self.nodes_info_file, self.harmonize_gpu_map, self.gpus
)


class MongoConfig(BaseModel):
Expand Down
27 changes: 24 additions & 3 deletions sarc/jobs/node_gpu_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,26 @@
import json
import os

import regex as re
from hostlist import expand_hostlist


class NodeToGPUMapping:
"""Helper class to generate JSON file, load it in memory, and query GPU type for a nodename."""

def __init__(self, cluster_name, nodes_info_file):
def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
"""Initialize with cluster name and TXT file path to parse."""

# Mapping is empty by default.
self.mapping = {}
self.json_path = None
self.harmonize_gpu_map = {
**{
re.compile(regex): gpu_type
for regex, gpu_type in harmonize_gpu_map.items()
},
**{re.compile(f".*{gpu}.*"): gpu for gpu in gpus},
}

# Mapping is filled only if TXT file is available.
if nodes_info_file and os.path.exists(nodes_info_file):
Expand All @@ -36,7 +44,7 @@ def __init__(self, cluster_name, nodes_info_file):
not os.path.exists(self.json_path)
or os.stat(self.json_path).st_mtime < info_file_stat.st_mtime
):
# Pase TXT file into self.mapping.
# Parse TXT file into self.mapping.
self._parse_nodenames(nodes_info_file, self.mapping)
# Save self.mapping into JSON file.
with open(self.json_path, "w", encoding="utf-8") as file:
Expand All @@ -46,9 +54,22 @@ def __init__(self, cluster_name, nodes_info_file):
with open(self.json_path, encoding="utf-8") as file:
self.mapping = json.load(file)

def _harmonize_gpu(self, gpu_type: str):
    """Return the harmonized GPU name for a raw GPU type, or None.

    The raw value is lower-cased, spaces are replaced with dashes and the
    result is split on ":". A leading "gpu" field is dropped, and only the
    first remaining field is matched against the compiled patterns stored
    in ``self.harmonize_gpu_map`` (iterated in insertion order). The value
    associated with the first matching pattern is returned.

    Returns None when ``gpu_type`` is None, when nothing is left after
    dropping the "gpu" prefix, or when no pattern matches.
    """
    if gpu_type is None:
        # Callers may pass the result of a failed mapping lookup.
        return None

    fields = gpu_type.lower().replace(" ", "-").split(":")
    if fields[0] == "gpu":
        fields.pop(0)
    if not fields:
        # The raw value was exactly "gpu": there is no model name to match.
        return None

    candidate = fields[0]
    for pattern, harmonized_gpu in self.harmonize_gpu_map.items():
        if pattern.match(candidate):
            return harmonized_gpu
    return None

def __getitem__(self, nodename):
    """Return the harmonized GPU type for nodename, or None if not found."""
    gpu_type = self.mapping.get(nodename)
    if gpu_type is None:
        # Unknown node: skip harmonization, which calls str methods on
        # its argument and would fail on None.
        return None
    return self._harmonize_gpu(gpu_type)

@staticmethod
def _parse_nodenames(path: str, output: dict):
Expand Down
15 changes: 11 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import json
import os
import shutil
import sys
import tempfile
import zoneinfo
from pathlib import Path
from unittest.mock import MagicMock, mock_open, patch
from unittest.mock import MagicMock, mock_open

from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from opentelemetry.trace import get_tracer_provider, set_tracer_provider
from opentelemetry.trace import set_tracer_provider

_tracer_provider = TracerProvider()
_exporter = InMemorySpanExporter()
Expand All @@ -24,7 +24,6 @@
from sarc.config import (
ClusterConfig,
Config,
MongoConfig,
ScraperConfig,
config,
parse_config,
Expand Down Expand Up @@ -73,6 +72,14 @@ def disabled_cache():
yield


@pytest.fixture(scope="session", autouse=True)
def clean_up_test_cache_before_run(standard_config_object):
    """Make sure the cache dir is empty before running the tests.

    Session-scoped and autouse: if the configured cache directory exists,
    it is removed before control is handed to the tests.
    """
    cache_dir = standard_config_object.cache
    if cache_dir.exists():
        shutil.rmtree(str(cache_dir))
    yield


@pytest.fixture
def tzlocal_is_mtl(monkeypatch):
monkeypatch.setattr("sarc.config.TZLOCAL", zoneinfo.ZoneInfo("America/Montreal"))
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/jobs/test_func_sacct.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ def test_get_gpu_type_without_prometheus(
job = jobs[0]
print(job)
print(job.nodes)
assert job.allocated.gpu_type == "gpu:asupergpu:4"
assert job.allocated.gpu_type == "asupergpu"

file_regression.check(
f"Found {len(jobs)} job(s):\n"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ Found 1 job(s):
"node": 1,
"billing": 1,
"gres_gpu": 1,
"gpu_type": "gpu:asupergpu:4"
"gpu_type": "asupergpu"
},
"stored_statistics": null
}
16 changes: 16 additions & 0 deletions tests/functional/test_clusterconfig.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest

from sarc.config import config


@pytest.mark.usefixtures("standard_config")
def test_clusterconfig_node_to_gpu():
    """Check that the cluster's node-to-GPU mapping returns configured GPU names."""
    cluster = config().clusters["raisin_no_prometheus"]
    node_to_gpu = cluster.node_to_gpu

    # A node lookup must resolve to one of the GPUs declared on the cluster.
    assert node_to_gpu["cn-c018"] in cluster.gpus

    # A raw name carrying a suffix must harmonize back to the declared GPU.
    raw_name = f"{cluster.gpus[0]}_suffix"
    assert node_to_gpu._harmonize_gpu(raw_name) == cluster.gpus[0]
6 changes: 5 additions & 1 deletion tests/sarc-test.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,11 @@
"duc_storage_command": null,
"diskusage_report_command": null,
"prometheus_url": null,
"nodes_info_file": "tests/not-so-secrets/raisin_no_prometheus/nodes_raisin_no_prometheus.txt"
"nodes_info_file": "tests/not-so-secrets/raisin_no_prometheus/nodes_raisin_no_prometheus.txt",
"gpus": ["asupergpu"],
"harmonize_gpu_map": {
".*asupergpu_suffix.*": "asupergpu"
}
},
"fromage": {
"host": "fromage",
Expand Down
38 changes: 38 additions & 0 deletions tests/unittests/jobs/test_node_to_gpu_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pytest

from sarc.jobs.node_gpu_mapping import NodeToGPUMapping


@pytest.mark.parametrize(
    "gpu_type,expected,harmonize_gpu_map,gpus",
    [
        # No pattern configured at all: nothing can match.
        ("DoesNotExist", None, {}, []),
        # Only the part before ":" is matched against the GPU names.
        ("prefix GPU1:suffix", "gpu1", {}, ["gpu1", "gpu2"]),
        # Spaces and upper case are normalized before matching.
        ("prefix GPU2 suffix", "gpu2", {}, ["gpu1", "gpu2"]),
        # An explicit regex entry maps a suffixed name to its GPU.
        (
            "prefix GPU1_suffix",
            "gpu1",
            {".*gpu1_suffix.*": "gpu1"},
            ["gpu1", "gpu2"],
        ),
    ],
)
def test_node_to_gpu_mapping(gpu_type, expected, harmonize_gpu_map, gpus):
    """Check GPU-name harmonization without any nodes info file."""
    node_to_gpu = NodeToGPUMapping("cluster", None, harmonize_gpu_map, gpus)
    harmonized = node_to_gpu._harmonize_gpu(gpu_type)

    assert harmonized == expected

0 comments on commit 262d37c

Please sign in to comment.