
Expose first_device parameter for setting models, scripts, functions #394

Merged: 7 commits, Oct 12, 2023
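In short, this change threads a new first_device parameter through DBObject/DBScript/DBModel, the Ensemble helpers (add_ml_model, add_script, add_function), the colocated launcher command builder, and the colocated entrypoint, so multi-device placement no longer has to start at device 0: with devices_per_node=N and first_device=M, objects are stored on devices M through M + N - 1. A quick placement sketch with illustrative values (not part of the diff):

# devices_per_node=3, first_device=1 -> ['GPU:1', 'GPU:2', 'GPU:3']
devices_per_node, first_device = 3, 1
print([f"GPU:{i}" for i in range(first_device, first_device + devices_per_node)])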
11 changes: 8 additions & 3 deletions doc/changelog.rst
@@ -19,11 +19,16 @@ To be released at some future point in time

Description

- Expose first_device parameter for scripts, functions, models
- Added support for MINBATCHTIMEOUT in model execution
- Remove support for RedisAI 1.2.5, use RedisAI 1.2.7 commit
- Add support for multiple databases

Detailed Notes

- Added support for first_device parameter for scripts, functions,
and models. This causes them to be loaded onto num_devices devices,
beginning with the device numbered first_device (PR394_)
- Added support for MINBATCHTIMEOUT in model execution, which caps the delay
waiting for a minimum number of model execution operations to accumulate
before executing them as a batch (PR387_)
@@ -32,10 +37,13 @@ Detailed Notes
bug which breaks the build process on Mac OSX, it was decided to
use commit 634916c_ from RedisAI's GitHub repository, where such
bug has been fixed. This applies to all operating systems. (PR383_)
- Add support for creation of multiple databases with unique identifiers. (PR342_)

.. _PR394: https://github.com/CrayLabs/SmartSim/pull/394
.. _PR387: https://github.com/CrayLabs/SmartSim/pull/387
.. _PR383: https://github.com/CrayLabs/SmartSim/pull/383
.. _634916c: https://github.com/RedisAI/RedisAI/commit/634916c722e718cc6ea3fad46e63f7d798f9adc2
.. _PR342: https://github.com/CrayLabs/SmartSim/pull/342

0.5.1
-----
@@ -44,7 +52,6 @@ Released on 14 September, 2023

Description

- Add support for multiple databases
- Add typehints throughout the SmartSim codebase
- Provide support for Slurm heterogeneous jobs
- Provide better support for `PalsMpiexecSettings`
@@ -62,7 +69,6 @@ Description

Detailed Notes

- Add support for creation of multiple databases with unique identifiers. (PR342_)
- Add methods to allow users to inspect files attached to models and ensembles. (PR352_)
- Add a `smart info` target to provide rudimentary information about the SmartSim installation. (PR350_)
- Remove unnecessary generation producing unexpected directories in the test suite. (PR349_)
@@ -86,7 +92,6 @@ Detailed Notes
- Update pylint dependency, update .pylintrc, mitigate non-breaking issues, suppress api breaks. (PR311_)
- Refactor the `smart` CLI to use subparsers for better documentation and extension. (PR308_)

.. _PR342: https://github.com/CrayLabs/SmartSim/pull/342
.. _PR352: https://github.com/CrayLabs/SmartSim/pull/352
.. _PR351: https://github.com/CrayLabs/SmartSim/pull/351
.. _PR350: https://github.com/CrayLabs/SmartSim/pull/350
14 changes: 9 additions & 5 deletions smartsim/_core/entrypoints/colocated.py
@@ -75,7 +75,8 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str:
parser.add_argument("--file", type=str)
parser.add_argument("--backend", type=str)
parser.add_argument("--device", type=str)
parser.add_argument("--devices_per_node", type=int)
parser.add_argument("--devices_per_node", type=int, default=1)
parser.add_argument("--first_device", type=int, default=0)
parser.add_argument("--batch_size", type=int, default=0)
parser.add_argument("--min_batch_size", type=int, default=0)
parser.add_argument("--min_batch_timeout", type=int, default=0)
@@ -100,7 +101,7 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str:
name=name,
model_file=args.file,
backend=args.backend,
fist_gpu=0,
first_gpu=args.first_device,
num_gpus=args.devices_per_node,
batch_size=args.batch_size,
min_batch_size=args.min_batch_size,
@@ -142,7 +143,8 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str:
parser.add_argument("--file", type=str)
parser.add_argument("--backend", type=str)
parser.add_argument("--device", type=str)
parser.add_argument("--devices_per_node", type=int)
parser.add_argument("--devices_per_node", type=int, default=1)
parser.add_argument("--first_device", type=int, default=0)
args = parser.parse_args(db_script)

if args.file and args.func:
@@ -151,13 +153,15 @@
if args.func:
func = args.func.replace("\\n", "\n")
if args.devices_per_node > 1 and args.device.lower() == "gpu":
client.set_script_multigpu(args.name, func, 0, args.devices_per_node)
client.set_script_multigpu(
args.name, func, args.first_device, args.devices_per_node
)
else:
client.set_script(args.name, func, args.device)
elif args.file:
if args.devices_per_node > 1 and args.device.lower() == "gpu":
client.set_script_from_file_multigpu(
args.name, args.file, 0, args.devices_per_node
args.name, args.file, args.first_device, args.devices_per_node
)
else:
client.set_script_from_file(args.name, args.file, args.device)
3 changes: 2 additions & 1 deletion smartsim/_core/launcher/colocated.py
@@ -218,6 +218,7 @@ def _build_db_model_cmd(db_models: t.List[DBModel]) -> t.List[str]:
cmd.append(f"--backend={db_model.backend}")
cmd.append(f"--device={db_model.device}")
cmd.append(f"--devices_per_node={db_model.devices_per_node}")
cmd.append(f"--first_device={db_model.first_device}")
if db_model.batch_size:
cmd.append(f"--batch_size={db_model.batch_size}")
if db_model.min_batch_size:
@@ -254,5 +255,5 @@ def _build_db_script_cmd(db_scripts: t.List[DBScript]) -> t.List[str]:
cmd.append(f"--file={db_script.file}")
cmd.append(f"--device={db_script.device}")
cmd.append(f"--devices_per_node={db_script.devices_per_node}")

cmd.append(f"--first_device={db_script.first_device}")
return cmd
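To make the flag plumbing concrete, here is a minimal, self-contained sketch of how a command line built by _build_db_script_cmd above would be consumed by the colocated entrypoint parser in smartsim/_core/entrypoints/colocated.py; the key name and device counts are illustrative, and the real parser defines additional arguments (--func, --file, --backend, and so on):

import argparse

# Parser mirroring the arguments relevant to device placement
# (defaults taken from the diff: devices_per_node=1, first_device=0).
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str)
parser.add_argument("--device", type=str)
parser.add_argument("--devices_per_node", type=int, default=1)
parser.add_argument("--first_device", type=int, default=0)

# A command such as the launcher now assembles:
cmd = ["--name=preproc", "--device=GPU", "--devices_per_node=2", "--first_device=2"]
args = parser.parse_args(cmd)
assert (args.first_device, args.devices_per_node) == (2, 2)

# With more than one GPU requested, launch_db_script then forwards both values:
#   client.set_script_multigpu(args.name, func, args.first_device, args.devices_per_node)
# i.e. the script lands on GPU:2 and GPU:3.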
47 changes: 36 additions & 11 deletions smartsim/entity/dbobject.py
@@ -46,6 +46,7 @@ def __init__(
file_path: t.Optional[str],
device: t.Literal["CPU", "GPU"],
devices_per_node: int,
first_device: int,
) -> None:
self.name = name
self.func = func
@@ -56,7 +57,8 @@ def __init__(
self.file = self._check_filepath(file_path)
self.device = self._check_device(device)
self.devices_per_node = devices_per_node
self._check_devices(device, devices_per_node)
self.first_device = first_device
self._check_devices(device, devices_per_node, first_device)

@property
def devices(self) -> t.List[str]:
@@ -118,16 +120,28 @@ def _enumerate_devices(self) -> t.List[str]:

if self.device == "GPU" and self.devices_per_node > 1:
return [
f"{self.device}:{str(device_num)}"
for device_num in range(self.devices_per_node)
f"{self.device}:{device_num}"
for device_num in range(
self.first_device, self.first_device + self.devices_per_node
)
]

return [self.device]

@staticmethod
def _check_devices(
device: t.Literal["CPU", "GPU"], devices_per_node: int
device: t.Literal["CPU", "GPU"], devices_per_node: int, first_device: int,
) -> None:
if device == "CPU" and devices_per_node > 1:
raise SSUnsupportedError(
"Cannot set devices_per_node>1 if CPU is specified under devices"
)

if device == "CPU" and first_device > 0:
raise SSUnsupportedError(
"Cannot set first_device>0 if CPU is specified under devices"
)

if devices_per_node == 1:
return

@@ -136,10 +150,6 @@ def _check_devices(
msg += f"the device was set to {device} and \
devices_per_node=={devices_per_node}"
raise ValueError(msg)
if device == "CPU":
raise SSUnsupportedError(
"Cannot set devices_per_node>1 if CPU is specified under devices"
)


class DBScript(DBObject):
@@ -150,14 +160,17 @@ def __init__(
script_path: t.Optional[str] = None,
device: t.Literal["CPU", "GPU"] = "CPU",
devices_per_node: int = 1,
first_device: int = 0,
):
"""TorchScript code represenation

Device selection is either "GPU" or "CPU". If many devices are
present, a number can be passed for specification e.g. "GPU:1".

Setting ``devices_per_node=N``, with N greater than one will result
in the model being stored on the first N devices of type ``device``.
in the script being stored on the first N devices of type ``device``;
additionally setting ``first_device=M`` will instead result in the
script being stored on devices M through M + N - 1.

One of either script (in memory representation) or script_path (file)
must be provided
@@ -172,8 +185,12 @@
:type device: str, optional
:param devices_per_node: number of devices to store the script on
:type devices_per_node: int
:param first_device: first device to store the script on
:type first_device: int
"""
super().__init__(name, script, script_path, device, devices_per_node)
super().__init__(
name, script, script_path, device, devices_per_node, first_device
)
if not script and not script_path:
raise ValueError("Either script or script_path must be provided")

@@ -191,6 +208,8 @@ def __str__(self) -> str:
"s per node\n" if self.devices_per_node > 1 else " per node\n"
)
desc_str += "Devices: " + str(self.devices_per_node) + " " + devices_str
if self.first_device > 0:
desc_str += "First device: " + str(self.first_device) + "\n"
return desc_str


@@ -203,6 +222,7 @@ def __init__(
model_file: t.Optional[str] = None,
device: t.Literal["CPU", "GPU"] = "CPU",
devices_per_node: int = 1,
first_device: int = 0,
batch_size: int = 0,
min_batch_size: int = 0,
min_batch_timeout: int = 0,
@@ -227,6 +247,8 @@
:type device: str, optional
:param devices_per_node: number of devices to store the model on
:type devices_per_node: int
:param first_device: The first device to store the model on
:type first_device: int
:param batch_size: batch size for execution, defaults to 0
:type batch_size: int, optional
:param min_batch_size: minimum batch size for model execution, defaults to 0
@@ -240,7 +262,8 @@
:param outputs: model outputs (TF only), defaults to None
:type outputs: list[str], optional
"""
super().__init__(name, model, model_file, device, devices_per_node)
super().__init__(
name, model, model_file, device, devices_per_node, first_device)
self.backend = self._check_backend(backend)
if not model and not model_file:
raise ValueError("Either model or model_file must be provided")
@@ -264,6 +287,8 @@ def __str__(self) -> str:
"s per node\n" if self.devices_per_node > 1 else " per node\n"
)
desc_str += "Devices: " + str(self.devices_per_node) + " " + devices_str
if self.first_device > 0:
desc_str += "First_device: " + str(self.first_device) + "\n"
desc_str += "Backend: " + str(self.backend) + "\n"
if self.batch_size:
desc_str += "Batch size: " + str(self.batch_size) + "\n"
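A short sketch of what the dbobject.py changes imply for callers, assuming the devices property wraps _enumerate_devices as in the class above and that SSUnsupportedError is importable from smartsim.error as elsewhere in the codebase; the key name and inline TorchScript are placeholders:

from smartsim.entity.dbobject import DBScript
from smartsim.error import SSUnsupportedError

# GPU placement can now start at an offset: two devices beginning at GPU:2
script = DBScript(
    name="preproc",
    script="def pre(x):\n    return x",
    device="GPU",
    devices_per_node=2,
    first_device=2,
)
print(script.devices)  # expected: ['GPU:2', 'GPU:3']

# CPU placement cannot be offset; _check_devices rejects first_device > 0
try:
    DBScript(name="bad", script="def f(x): return x", device="CPU", first_device=1)
except SSUnsupportedError as err:
    print(err)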
22 changes: 20 additions & 2 deletions smartsim/entity/ensemble.py
@@ -362,6 +362,7 @@ def add_ml_model(
model_path: t.Optional[str] = None,
device: t.Literal["CPU", "GPU"] = "CPU",
devices_per_node: int = 1,
first_device: int = 0,
batch_size: int = 0,
min_batch_size: int = 0,
min_batch_timeout: int = 0,
@@ -388,6 +389,12 @@
:type backend: str
:param device: name of device for execution, defaults to "CPU"
:type device: str, optional
:param devices_per_node: number of GPUs per node in multiGPU nodes,
defaults to 1
:type devices_per_node: int, optional
:param first_device: first device in multi-GPU nodes to use for execution,
defaults to 0; ignored if devices_per_node is 1
:type first_device: int, optional
:param batch_size: batch size for execution, defaults to 0
:type batch_size: int, optional
:param min_batch_size: minimum batch size for model execution, defaults to 0
@@ -408,6 +415,7 @@
model_file=model_path,
device=device,
devices_per_node=devices_per_node,
first_device=first_device,
batch_size=batch_size,
min_batch_size=min_batch_size,
min_batch_timeout=min_batch_timeout,
@@ -426,6 +434,7 @@ def add_script(
script_path: t.Optional[str] = None,
device: t.Literal["CPU", "GPU"] = "CPU",
devices_per_node: int = 1,
first_device: int = 0,
) -> None:
"""TorchScript to launch with every entity belonging to this ensemble

@@ -452,13 +461,16 @@
:type device: str, optional
:param devices_per_node: number of devices on each host
:type devices_per_node: int
:param first_device: first device to use on each host
:type first_device: int
"""
db_script = DBScript(
name=name,
script=script,
script_path=script_path,
device=device,
devices_per_node=devices_per_node,
first_device=first_device,
)
self._db_scripts.append(db_script)
for entity in self.models:
@@ -470,6 +482,7 @@ def add_function(
function: t.Optional[str] = None,
device: t.Literal["CPU", "GPU"] = "CPU",
devices_per_node: int = 1,
first_device: int = 0,
) -> None:
"""TorchScript function to launch with every entity belonging to this ensemble

@@ -483,7 +496,9 @@
present, a number can be passed for specification e.g. "GPU:1".

Setting ``devices_per_node=N``, with N greater than one will result
in the model being stored in the first N devices of type ``device``.
in the script being stored in the first N devices of type ``device``;
alternatively, setting ``first_device=M`` will result in the script
being stored on devices M through M + N - 1.

:param name: key to store function under
:type name: str
@@ -493,9 +508,12 @@
:type device: str, optional
:param devices_per_node: number of devices on each host
:type devices_per_node: int
:param first_device: first device to use on each host
:type first_device: int
"""
db_script = DBScript(
name=name, script=function, device=device, devices_per_node=devices_per_node
name=name, script=function, device=device,
devices_per_node=devices_per_node, first_device=first_device
)
self._db_scripts.append(db_script)
for entity in self.models:
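Finally, a usage sketch for the new keyword on the ensemble helpers above; the ensemble object, key names, file path, and TorchScript body are placeholders rather than part of this diff:

# Store a Torch model on GPU:2 and GPU:3 of every host running the ensemble
ensemble.add_ml_model(
    "classifier",
    backend="TORCH",
    model_path="./model.pt",
    device="GPU",
    devices_per_node=2,
    first_device=2,
)

# The same placement rule applies to TorchScript functions and script files
ensemble.add_function(
    "normalize",
    function="def normalize(tensor):\n    return tensor / tensor.max()",
    device="GPU",
    devices_per_node=2,
    first_device=2,
)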