Skip to content

Commit

Permalink
drop deepmd.tf.cluster.slurm (#3239)
Browse files Browse the repository at this point in the history
The `deepmd.tf.cluster.slurm` module is too specialized - we are not able to
support every cluster. This PR uses mpi4py as the alternative,
since the documentation already asks users to install mpi4py.

---------

Signed-off-by: Jinzhe Zeng <[email protected]>
  • Loading branch information
njzjz authored Feb 6, 2024
1 parent 1c8d635 commit 6672a28
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 142 deletions.
7 changes: 1 addition & 6 deletions deepmd/tf/cluster/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
"""Module that reads node resources, auto detects if running local or on SLURM."""

import os
from typing import (
List,
Optional,
Tuple,
)

from .local import get_resource as get_local_res
from .slurm import get_resource as get_slurm_res

__all__ = ["get_resource"]

Expand All @@ -22,7 +20,4 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
Tuple[str, List[str], Optional[List[int]]]
nodename, nodelist, and gpus
"""
if "SLURM_JOB_NODELIST" in os.environ:
return get_slurm_res()
else:
return get_local_res()
return get_local_res()
7 changes: 4 additions & 3 deletions deepmd/tf/cluster/local.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
"""Get local GPU resources."""

import socket
import subprocess as sp
import sys
from typing import (
Expand All @@ -13,6 +12,9 @@
from deepmd.tf.env import (
tf,
)
from deepmd.utils.hostlist import (
get_host_names,
)

__all__ = ["get_gpus", "get_resource"]

Expand Down Expand Up @@ -57,7 +59,6 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
Tuple[str, List[str], Optional[List[int]]]
nodename, nodelist, and gpus
"""
nodename = socket.gethostname()
nodelist = [nodename]
nodename, nodelist = get_host_names()
gpus = get_gpus()
return nodename, nodelist, gpus
59 changes: 0 additions & 59 deletions deepmd/tf/cluster/slurm.py

This file was deleted.

34 changes: 34 additions & 0 deletions deepmd/utils/hostlist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
import socket
from typing import (
List,
Tuple,
)


def get_host_names() -> Tuple[str, List[str]]:
    """Get host names of all nodes in the cluster.

    If mpi4py is not installed or MPI is not used, then the
    host name of the current node is returned as those of all nodes.

    Returns
    -------
    str
        Host name of the current node
    List[str]
        List of unique host names of all nodes in the cluster, ordered by
        first appearance in the rank-ordered gather result
    """
    host_name = socket.gethostname()
    try:
        # Imported lazily so that mpi4py stays an optional dependency.
        from mpi4py import (
            MPI,
        )
    except ImportError:
        return host_name, [host_name]

    comm = MPI.COMM_WORLD
    if comm.Get_size() == 1:
        # Single process: nothing to gather.
        return host_name, [host_name]
    host_names = comm.allgather(host_name)
    # De-duplicate while keeping the gathered order. A plain ``set`` would
    # give an arbitrary order that can differ from process to process
    # (string hashing is randomized per interpreter), so ranks could end up
    # disagreeing on the node list ordering.
    return host_name, list(dict.fromkeys(host_names))
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ dependencies = [
'scipy',
'pyyaml',
'dargs >= 0.4.1',
'python-hostlist >= 1.21',
'typing_extensions; python_version < "3.8"',
'importlib_metadata>=1.4; python_version < "3.8"',
'h5py',
Expand Down
73 changes: 0 additions & 73 deletions source/tests/tf/test_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from deepmd.tf.cluster import (
local,
slurm,
)

kHostName = "compute-b24-1"
Expand Down Expand Up @@ -70,75 +69,3 @@ def test_resource(self, mock_gethostname):
nodename, nodelist, _ = local.get_resource()
self.assertEqual(nodename, kHostName)
self.assertEqual(nodelist, [kHostName])


class TestSlurm(unittest.TestCase):
@mock.patch.dict(
"os.environ",
values={
"SLURM_JOB_NODELIST": kHostName,
"SLURMD_NODENAME": kHostName,
"SLURM_JOB_NUM_NODES": "1",
},
)
def test_single(self):
nodename, nodelist, _ = slurm.get_resource()
self.assertEqual(nodename, kHostName)
self.assertEqual(nodelist, [kHostName])

@mock.patch.dict(
"os.environ",
values={
"SLURM_JOB_NODELIST": "compute-b24-[1-3,5-9],compute-b25-[4,8]",
"SLURMD_NODENAME": "compute-b24-2",
"SLURM_JOB_NUM_NODES": "10",
},
)
def test_multiple(self):
nodename, nodelist, _ = slurm.get_resource()
self.assertEqual(nodename, "compute-b24-2")
self.assertEqual(
nodelist,
[
"compute-b24-1",
"compute-b24-2",
"compute-b24-3",
"compute-b24-5",
"compute-b24-6",
"compute-b24-7",
"compute-b24-8",
"compute-b24-9",
"compute-b25-4",
"compute-b25-8",
],
)

def test_illegal(self):
environ = {
"SLURM_JOB_NODELIST": "compute-b24-[3-5]",
"SLURMD_NODENAME": "compute-b24-4",
}
with mock.patch.dict("os.environ", environ):
with self.assertRaises(RuntimeError) as cm:
_ = slurm.get_resource()
self.assertIn("Could not get SLURM number", str(cm.exception))

environ = {
"SLURM_JOB_NODELIST": "compute-b24-1,compute-b25-2",
"SLURMD_NODENAME": "compute-b25-2",
"SLURM_JOB_NUM_NODES": "4",
}
with mock.patch.dict("os.environ", environ):
with self.assertRaises(ValueError) as cm:
_ = slurm.get_resource()
self.assertIn("Number of slurm nodes 2", str(cm.exception))

environ = {
"SLURM_JOB_NODELIST": "compute-b24-1,compute-b25-3",
"SLURMD_NODENAME": "compute-b25-2",
"SLURM_JOB_NUM_NODES": "2",
}
with mock.patch.dict("os.environ", environ):
with self.assertRaises(ValueError) as cm:
_ = slurm.get_resource()
self.assertIn("Nodename(compute-b25-2", str(cm.exception))

0 comments on commit 6672a28

Please sign in to comment.