feat!: improved GPU job support (#173)

cmeesters · johanneskoester · coderabbitai[bot] · web-flow · commit 66dcdcfed1b2 · 2025-03-11T13:44:59.000+01:00
In the light of more and more accelerator applications (AI, base mapping, ...) the fall-back onto `slurm_extra` becomes a bit tedious to use. Hence, the resource support for `gres`. Addresses issue #52 (and to a minor extent: #18 and #104). Supersedes PR #172 .  ## Summary by CodeRabbit - **New Features** - Updated documentation section on "GPU Jobs" to clarify how to request GPU resources with new syntax examples. - **Bug Fixes** - Improved error handling and reporting for job submission processes. - Clarified error messages in test cases for better understanding. - **Dependency Updates** - Updated `snakemake-executor-plugin-slurm-jobstep` dependency version from `^0.2.0` to `^0.3.0`. - **Tests** - Streamlined test cases by removing less relevant tests and enhancing clarity of error messages.  --------- Co-authored-by: Johannes Köster <johannes.koester@uni-due.de> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
diff --git a/docs/further.md b/docs/further.md
@@ -88,6 +88,36 @@ $ snakemake --set-resources calc_pi:mpi="mpiexec" ...
 
 To submit "ordinary" MPI jobs, submitting with `tasks` (the MPI ranks) is sufficient. Alternatively, on some clusters, it might be convenient to just configure `nodes`. Consider using a combination of `tasks` and `cpus_per_task` for hybrid applications (those that use ranks (multiprocessing) and threads). A detailed topology layout can be achieved using the `slurm_extra` parameter (see below) using further flags like `--distribution`.
 
+### GPU Jobs
+
+SLURM allows specifying GPU requests with the `--gres` or `--gpus` flags, and Snakemake takes a similar approach. GPU resources can be requested in one of two ways:
+
+- The resource `gpu` can be used, e.g. by just requesting the number of GPUs like `gpu=2`. It can be used on its own or combined with the `gpu_model` resource, e.g. `gpu_model=a100`. The combination will result in a flag to `sbatch` like `--gpus=a100:2`. Setting `gpu_model` without `gpu` is an error. The Snakemake `gpu` resource has to be a number.
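+- Alternatively, use the resource `gres`. Its syntax is `<string>:<number>` or `<string>:<model>:<number>`, e.g. `gres=gpu:1` or `gres=gpu:a100:2` (the latter assuming a GPU model named `a100`). Do not set `gres` and `gpu` for the same rule; this is rejected with an error.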
+
+.. note:: Internally, Snakemake knows the resource `gpu_manufacturer`, too. However, SLURM does not know the distinction between model and manufacturer. Essentially, the preferred way to request an accelerator will depend on your specific cluster setup.
+    Also, to be consistent within Snakemake, the resource is called `gpu` not `gpus`.
+
+Additionally, `cpus_per_gpu` can be set - otherwise Snakemake's `threads` setting will be used to set `cpus_per_gpu`. If `cpus_per_gpu` is lower than or equal to zero, no CPU is requested from SLURM (and cluster defaults will kick in, if any).
+
+A sample workflow profile might look like:
+
+```YAML
+set-resources:
+    gres_request_rule:
+        gres: "gpu:1"
+
+    multi_gpu_rule:
+        gpu: 2
+        gpu_model: "a100"
+        cpus_per_gpu: 4
+
+    no_cpu_gpu_rule:
+        gpu: 1
+        cpus_per_gpu: 0 # Values <= 0 indicate that NO CPU request string
+                        # will be issued.
+```
+
 ### Running Jobs locally
 
 Not all Snakemake workflows are adapted for heterogeneous environments, particularly clusters. Users might want to avoid the submission of _all_ rules as cluster jobs. Non-cluster jobs should usually include _short_ jobs, e.g. internet downloads or plotting rules.
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,7 +17,7 @@ keywords = ["snakemake", "plugin", "executor", "cluster", "slurm"]
 python = "^3.11"
 snakemake-interface-common = "^1.13.0"
 snakemake-interface-executor-plugins = "^9.1.1"
-snakemake-executor-plugin-slurm-jobstep = "^0.2.0"
+snakemake-executor-plugin-slurm-jobstep = "^0.3.0"
 throttler = "^1.2.2"
 
 [tool.poetry.group.dev.dependencies]
diff --git a/snakemake_executor_plugin_slurm/__init__.py b/snakemake_executor_plugin_slurm/__init__.py
@@ -26,9 +26,9 @@
     JobExecutorInterface,
 )
 from snakemake_interface_common.exceptions import WorkflowError
-from snakemake_executor_plugin_slurm_jobstep import get_cpus_per_task
+from snakemake_executor_plugin_slurm_jobstep import get_cpu_setting
 
-from .utils import delete_slurm_environment, delete_empty_dirs
+from .utils import delete_slurm_environment, delete_empty_dirs, set_gres_string
 
 
 @dataclass
@@ -217,6 +217,8 @@ def run_job(self, job: JobExecutorInterface):
         if self.workflow.executor_settings.requeue:
             call += " --requeue"
 
+        call += set_gres_string(job)
+
         if job.resources.get("clusters"):
             call += f" --clusters {job.resources.clusters}"
 
@@ -247,7 +249,11 @@ def run_job(self, job: JobExecutorInterface):
 
         # fixes #40 - set ntasks regardless of mpi, because
         # SLURM v22.05 will require it for all jobs
-        call += f" --ntasks={job.resources.get('tasks', 1)}"
+        gpu_job = job.resources.get("gpu") or "gpu" in job.resources.get("gres", "")
+        if gpu_job:
+            call += f" --ntasks-per-gpu={job.resources.get('tasks', 1)}"
+        else:
+            call += f" --ntasks={job.resources.get('tasks', 1)}"
         # MPI job
         if job.resources.get("mpi", False):
             if not job.resources.get("tasks_per_node") and not job.resources.get(
@@ -258,8 +264,9 @@ def run_job(self, job: JobExecutorInterface):
                     "specified. Assuming 'tasks_per_node=1'."
                     "Probably not what you want."
                 )
-
-        call += f" --cpus-per-task={get_cpus_per_task(job)}"
+        # we need to set cpus-per-task OR cpus-per-gpu, the function
+        # will return a string with the corresponding value
+        call += f" {get_cpu_setting(job, gpu_job)}"
 
         if job.resources.get("slurm_extra"):
             self.check_slurm_extra(job)
diff --git a/snakemake_executor_plugin_slurm/utils.py b/snakemake_executor_plugin_slurm/utils.py
@@ -1,8 +1,14 @@
 # utility functions for the SLURM executor plugin
 
 import os
+import re
 from pathlib import Path
 
+from snakemake_interface_executor_plugins.jobs import (
+    JobExecutorInterface,
+)
+from snakemake_interface_common.exceptions import WorkflowError
+
 
 def delete_slurm_environment():
     """
@@ -40,3 +46,59 @@ def delete_empty_dirs(path: Path) -> None:
     except (OSError, FileNotFoundError) as e:
         # Provide more context in the error message
         raise OSError(f"Failed to remove empty directory {path}: {e}") from e
+
+
+def set_gres_string(job: JobExecutorInterface) -> str:
+    """
+    Function to set the gres string for the SLURM job
+    based on the resources requested in the job.
+    """
+    # generic resources (GRES) arguments can be of type
+    # "string:int" or "string:string:int"
+    gres_re = re.compile(r"^[a-zA-Z0-9_]+(:[a-zA-Z0-9_]+)?:\d+$")
+    # gpu model arguments can be of type "string"
+    gpu_model_re = re.compile(r"^[a-zA-Z0-9_]+$")
+    # The Snakemake resources can be only be of type "int",
+    # hence no further regex is needed.
+
+    gpu_string = None
+    if job.resources.get("gpu"):
+        gpu_string = str(job.resources.get("gpu"))
+
+    gpu_model = None
+    if job.resources.get("gpu_model"):
+        gpu_model = job.resources.get("gpu_model")
+
+    # ensure that gres is not set, if gpu and gpu_model are set
+    if job.resources.get("gres") and gpu_string:
+        raise WorkflowError(
+            "GRES and GPU are set. Please only set one of them.", rule=job.rule
+        )
+    elif not job.resources.get("gres") and not gpu_model and not gpu_string:
+        return ""
+
+    if job.resources.get("gres"):
+        # Validate GRES format (e.g., "gpu:1", "gpu:tesla:2")
+        gres = job.resources.gres
+        if not gres_re.match(gres):
+            raise WorkflowError(
+                f"Invalid GRES format: {gres}. Expected format: "
+                "'<name>:<number>' or '<name>:<type>:<number>' "
+                "(e.g., 'gpu:1' or 'gpu:tesla:2')"
+            )
+        return f" --gres={job.resources.gres}"
+
+    if gpu_model and gpu_string:
+        # validate GPU model format
+        if not gpu_model_re.match(gpu_model):
+            raise WorkflowError(
+                f"Invalid GPU model format: {gpu_model}."
+                " Expected format: '<name>' (e.g., 'tesla')"
+            )
+        return f" --gpus={gpu_model}:{gpu_string}"
+    elif gpu_model and not gpu_string:
+        raise WorkflowError("GPU model is set, but no GPU number is given")
+    elif gpu_string:
+        # we assume here, that the validator ensures that the 'gpu_string'
+        # is an integer
+        return f" --gpus={gpu_string}"
diff --git a/tests/tests.py b/tests/tests.py
@@ -1,8 +1,12 @@
 from typing import Optional
 import snakemake.common.tests
 from snakemake_interface_executor_plugins.settings import ExecutorSettingsBase
+from unittest.mock import MagicMock
+import pytest
 
 from snakemake_executor_plugin_slurm import ExecutorSettings
+from snakemake_executor_plugin_slurm.utils import set_gres_string
+from snakemake_interface_common.exceptions import WorkflowError
 
 
 class TestWorkflows(snakemake.common.tests.TestWorkflowsLocalStorageBase):
@@ -18,3 +22,91 @@ def get_executor_settings(self) -> Optional[ExecutorSettingsBase]:
 class TestWorkflowsRequeue(TestWorkflows):
     def get_executor_settings(self) -> Optional[ExecutorSettingsBase]:
         return ExecutorSettings(requeue=True)
+
+
+class TestGresString:
+    """Test cases for the set_gres_string function.
+
+    All tests run against mocked jobs (see the ``mock_job`` fixture), so
+    no SLURM installation is involved.
+    """
+
+    @pytest.fixture
+    def mock_job(self):
+        """Create a mock job with configurable resources.
+
+        Returns a factory: calling it with keyword arguments yields a
+        ``MagicMock`` job whose ``resources.get(key, default)`` looks the
+        key up in those keyword arguments and whose ``resources`` also
+        carries each of them as an attribute.
+        """
+
+        def _create_job(**resources):
+            mock_resources = MagicMock()
+            # Configure get method to return values from resources dict
+            mock_resources.get.side_effect = lambda key, default=None: resources.get(
+                key, default
+            )
+            # Add direct attribute access for certain resources
+            for key, value in resources.items():
+                setattr(mock_resources, key, value)
+
+            mock_job = MagicMock()
+            mock_job.resources = mock_resources
+            return mock_job
+
+        return _create_job
+
+    def test_no_gres_or_gpu(self, mock_job):
+        """Test with no GPU or GRES resources specified."""
+        job = mock_job()
+        assert set_gres_string(job) == ""
+
+    def test_valid_gres_simple(self, mock_job):
+        """Test with valid GRES format (simple)."""
+        job = mock_job(gres="gpu:1")
+        assert set_gres_string(job) == " --gres=gpu:1"
+
+    def test_valid_gres_with_model(self, mock_job):
+        """Test with valid GRES format including GPU model."""
+        job = mock_job(gres="gpu:tesla:2")
+        assert set_gres_string(job) == " --gres=gpu:tesla:2"
+
+    def test_invalid_gres_format(self, mock_job):
+        """Test with invalid GRES format (name only, no count)."""
+        job = mock_job(gres="gpu")
+        with pytest.raises(WorkflowError, match="Invalid GRES format"):
+            set_gres_string(job)
+
+    def test_invalid_gres_format_missing_count(self, mock_job):
+        """Test with invalid GRES format (missing count)."""
+        job = mock_job(gres="gpu:tesla:")
+        with pytest.raises(WorkflowError, match="Invalid GRES format"):
+            set_gres_string(job)
+
+    def test_valid_gpu_number(self, mock_job):
+        """Test with valid GPU number."""
+        job = mock_job(gpu="2")
+        assert set_gres_string(job) == " --gpus=2"
+
+    def test_valid_gpu_with_name(self, mock_job):
+        """Test with valid GPU name and number."""
+        # the 'gpu' value is expected to be handed to --gpus unchanged
+        job = mock_job(gpu="tesla:2")
+        assert set_gres_string(job) == " --gpus=tesla:2"
+
+    def test_gpu_with_model(self, mock_job):
+        """Test GPU with model specification."""
+        job = mock_job(gpu="2", gpu_model="tesla")
+        assert set_gres_string(job) == " --gpus=tesla:2"
+
+    def test_invalid_gpu_model_format(self, mock_job):
+        """Test with invalid GPU model format."""
+        job = mock_job(gpu="2", gpu_model="invalid:model")
+        with pytest.raises(WorkflowError, match="Invalid GPU model format"):
+            set_gres_string(job)
+
+    def test_gpu_model_without_gpu(self, mock_job):
+        """Test GPU model without GPU number."""
+        job = mock_job(gpu_model="tesla")
+        with pytest.raises(
+            WorkflowError, match="GPU model is set, but no GPU number is given"
+        ):
+            set_gres_string(job)
+
+    def test_both_gres_and_gpu_set(self, mock_job):
+        """Test error case when both GRES and GPU are specified."""
+        job = mock_job(gres="gpu:1", gpu="2")
+        with pytest.raises(
+            WorkflowError, match="GRES and GPU are set. Please only set one of them."
+        ):
+            set_gres_string(job)