Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Promote Build Device Option to Enum #527

Merged
merged 32 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
25f1ed6
initial commit
amandarichardsonn Mar 20, 2024
cfe9dfa
reverting
amandarichardsonn Mar 21, 2024
8adbb74
enums
amandarichardsonn Mar 21, 2024
5d332c1
test
amandarichardsonn Mar 21, 2024
9b84146
test
amandarichardsonn Mar 21, 2024
05a72d3
sus
amandarichardsonn Mar 22, 2024
07de12f
test
amandarichardsonn Mar 22, 2024
69deada
test
amandarichardsonn Mar 22, 2024
bb0aa90
testing
amandarichardsonn Mar 22, 2024
caffc07
test
amandarichardsonn Mar 22, 2024
db164af
test
amandarichardsonn Mar 22, 2024
cd1f8e1
testing mypy
amandarichardsonn Mar 22, 2024
e155fa4
testing
amandarichardsonn Mar 22, 2024
4b93b4d
testing
amandarichardsonn Mar 22, 2024
d2a3a4f
import sorting
amandarichardsonn Mar 22, 2024
c8d1690
black
amandarichardsonn Mar 22, 2024
02690cf
changes
amandarichardsonn Mar 25, 2024
f40585e
changes
amandarichardsonn Mar 26, 2024
c33f615
mypy issue
amandarichardsonn Mar 26, 2024
2a33d5b
make black
amandarichardsonn Mar 26, 2024
be467a2
test
amandarichardsonn Mar 26, 2024
184978b
make style
amandarichardsonn Mar 26, 2024
a92a9bd
test
amandarichardsonn Mar 26, 2024
ddd0d92
works
amandarichardsonn Mar 26, 2024
5959319
test
amandarichardsonn Mar 26, 2024
8196396
address Matts comments
amandarichardsonn Mar 28, 2024
06f50ef
Merge branch 'develop' of https://github.com/CrayLabs/SmartSim in…
amandarichardsonn Mar 28, 2024
7625c03
lambda change
amandarichardsonn Mar 28, 2024
281f749
small changes
amandarichardsonn Mar 28, 2024
01d4582
test
amandarichardsonn Mar 28, 2024
252f22c
fix issue
amandarichardsonn Mar 28, 2024
8182e9d
test fixed
amandarichardsonn Mar 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ To be released at some future point in time

Description

- Promote device options to an Enum
- Update telemetry monitor, add telemetry collectors
- Add method to specify node features for a Slurm job
- Colo Orchestrator setup now blocks application start until setup finished
Expand All @@ -33,6 +34,7 @@ Description

Detailed Notes

- Promote devices to a dedicated Enum type throughout the SmartSim code base.
- Update the telemetry monitor to enable retrieval of metrics on a scheduled
interval. Switch basic experiment tracking telemetry to default to on. Add
database metric collectors. Improve telemetry monitor logging. Create
Expand Down Expand Up @@ -70,6 +72,7 @@ Detailed Notes
- Remove previously deprecated behavior present in test suite on machines with
Slurm and Open MPI. (SmartSim-PR520_)

.. _SmartSim-PR498: https://github.com/CrayLabs/SmartSim/pull/498
.. _SmartSim-PR460: https://github.com/CrayLabs/SmartSim/pull/460
.. _SmartSim-PR512: https://github.com/CrayLabs/SmartSim/pull/512
.. _SmartSim-PR529: https://github.com/CrayLabs/SmartSim/pull/529
Expand Down
33 changes: 15 additions & 18 deletions smartsim/_core/_cli/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
VersionConflictError,
Versioner,
)
from smartsim._core._install.builder import BuildError
from smartsim._core._install.builder import BuildError, Device
from smartsim._core.config import CONFIG
from smartsim._core.utils.helpers import installed_redisai_backends
from smartsim.error import SSConfigError
Expand All @@ -54,8 +54,6 @@
# NOTE: all smartsim modules need full paths as the smart cli
# may be installed into a different directory.


_TDeviceStr = t.Literal["cpu", "gpu"]
_TPinningStr = t.Literal["==", "!=", ">=", ">", "<=", "<", "~="]


Expand Down Expand Up @@ -134,7 +132,7 @@ def build_database(
def build_redis_ai(
build_env: BuildEnv,
versions: Versioner,
device: _TDeviceStr,
device: Device,
use_torch: bool = True,
use_tf: bool = True,
use_onnx: bool = False,
Expand All @@ -143,7 +141,7 @@ def build_redis_ai(
verbose: bool = False,
) -> None:
# make sure user isn't trying to do something silly on MacOS
if build_env.PLATFORM == "darwin" and device == "gpu":
if build_env.PLATFORM == "darwin" and device == Device.GPU:
raise BuildError("SmartSim does not support GPU on MacOS")

# decide which runtimes to build
Expand All @@ -154,7 +152,7 @@ def build_redis_ai(
["ONNX", versions.ONNX, color_bool(use_onnx)],
]
print(tabulate(backends_table, tablefmt="fancy_outline"), end="\n\n")
print(f"Building for GPU support: {color_bool(device == 'gpu')}\n")
print(f"Building for GPU support: {color_bool(device == Device.GPU)}\n")

if not check_backends_install():
sys.exit(1)
Expand Down Expand Up @@ -195,7 +193,7 @@ def build_redis_ai(
else:
# get the build environment, update with CUDNN env vars
# if present and building for GPU, otherwise warn the user
if device == "gpu":
if device == Device.GPU:
gpu_env = build_env.get_cudnn_env()
cudnn_env_vars = [
"CUDNN_LIBRARY",
Expand Down Expand Up @@ -226,18 +224,16 @@ def build_redis_ai(
logger.info("ML Backends and RedisAI build complete!")


def check_py_torch_version(versions: Versioner, device_in: _TDeviceStr = "cpu") -> None:
def check_py_torch_version(versions: Versioner, device: Device = Device.CPU) -> None:
"""Check Python environment for Torch installation"""

device = device_in.lower()
if BuildEnv.is_macos():
if device == "gpu":
if device == Device.GPU:
raise BuildError("SmartSim does not support GPU on MacOS")
device_suffix = ""
else: # linux
if device == "cpu":
if device == Device.CPU:
device_suffix = versions.TORCH_CPU_SUFFIX
elif device == "gpu":
elif device == Device.GPU:
device_suffix = versions.TORCH_CUDA_SUFFIX
else:
raise BuildError("Unrecognized device requested")
Expand All @@ -261,7 +257,9 @@ def check_py_torch_version(versions: Versioner, device_in: _TDeviceStr = "cpu")
"Torch version not found in python environment. "
"Attempting to install via `pip`"
)
wheel_device = device if device == "cpu" else device_suffix.replace("+", "")
wheel_device = (
device.value if device == Device.CPU else device_suffix.replace("+", "")
)
pip(
"install",
"--extra-index-url",
Expand Down Expand Up @@ -363,8 +361,7 @@ def execute(
) -> int:
verbose = args.v
keydb = args.keydb
device: _TDeviceStr = args.device

device = Device(args.device.lower())
# torch and tf build by default
pt = not args.no_pt # pylint: disable=invalid-name
tf = not args.no_tf # pylint: disable=invalid-name
Expand Down Expand Up @@ -453,8 +450,8 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--device",
type=str.lower,
default="cpu",
choices=["cpu", "gpu"],
default=Device.CPU.value,
choices=[device.value for device in Device],
help="Device to build ML runtimes for",
)
parser.add_argument(
Expand Down
33 changes: 18 additions & 15 deletions smartsim/_core/_cli/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@

from smartsim import Experiment
from smartsim._core._cli.utils import SMART_LOGGER_FORMAT
from smartsim._core._install.builder import Device
from smartsim._core.utils.helpers import installed_redisai_backends
from smartsim.log import get_logger

Expand All @@ -61,9 +62,6 @@
_TemporaryDirectory = tempfile.TemporaryDirectory


_TCapitalDeviceStr = t.Literal["CPU", "GPU"]


class _VerificationTempDir(_TemporaryDirectory):
"""A Temporary directory to be used as a context manager that will only
clean itself up if no error is raised within its context
Expand All @@ -88,7 +86,7 @@ def execute(
simple experiment
"""
backends = installed_redisai_backends()
device: _TCapitalDeviceStr = args.device.upper()
device: Device = Device(args.device)
try:
with contextlib.ExitStack() as ctx:
temp_dir = ctx.enter_context(_VerificationTempDir(dir=os.getcwd()))
Expand All @@ -98,7 +96,7 @@ def execute(
"SR_LOG_FILE", os.path.join(temp_dir, "smartredis.log")
),
}
if device == "GPU":
if device == Device.GPU:
validate_env["CUDA_VISIBLE_DEVICES"] = "0"
ctx.enter_context(_env_vars_set_to(validate_env))
test_install(
Expand Down Expand Up @@ -136,16 +134,16 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--device",
type=str.lower,
default="cpu",
choices=["cpu", "gpu"],
default=Device.CPU.value,
choices=[device.value for device in Device],
help="Device to test the ML backends against",
)


def test_install(
location: str,
port: t.Optional[int],
device: _TCapitalDeviceStr,
device: Device,
with_tf: bool,
with_pt: bool,
with_onnx: bool,
Expand Down Expand Up @@ -214,7 +212,7 @@ def _find_free_port() -> int:
return int(port)


def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) -> None:
def _test_tf_install(client: Client, tmp_dir: str, device: Device) -> None:
recv_conn, send_conn = mp.Pipe(duplex=False)
# Build the model in a subproc so that keras does not hog the gpu
proc = mp.Process(target=_build_tf_frozen_model, args=(send_conn, tmp_dir))
Expand All @@ -236,7 +234,12 @@ def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) -
) from e

client.set_model_from_file(
"keras-fcn", model_path, "TF", device=device, inputs=inputs, outputs=outputs
"keras-fcn",
model_path,
"TF",
device=device.value.upper(),
inputs=inputs,
outputs=outputs,
)
client.put_tensor("keras-input", np.random.rand(1, 28, 28).astype(np.float32))
client.run_model("keras-fcn", inputs=["keras-input"], outputs=["keras-output"])
Expand Down Expand Up @@ -264,7 +267,7 @@ def _build_tf_frozen_model(conn: "Connection", tmp_dir: str) -> None:
conn.send((model_path, inputs, outputs))


def _test_torch_install(client: Client, device: _TCapitalDeviceStr) -> None:
def _test_torch_install(client: Client, device: Device) -> None:
import torch
from torch import nn

Expand All @@ -276,7 +279,7 @@ def __init__(self) -> None:
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.conv(x)

if device == "GPU":
if device == Device.GPU:
device_ = torch.device("cuda")
else:
device_ = torch.device("cpu")
Expand All @@ -292,13 +295,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
torch.jit.save(traced, buffer) # type: ignore[no-untyped-call]
model = buffer.getvalue()

client.set_model("torch-nn", model, backend="TORCH", device=device)
client.set_model("torch-nn", model, backend="TORCH", device=device.value.upper())
client.put_tensor("torch-in", torch.rand(1, 1, 3, 3).numpy())
client.run_model("torch-nn", inputs=["torch-in"], outputs=["torch-out"])
client.get_tensor("torch-out")


def _test_onnx_install(client: Client, device: _TCapitalDeviceStr) -> None:
def _test_onnx_install(client: Client, device: Device) -> None:
from skl2onnx import to_onnx
from sklearn.cluster import KMeans

Expand All @@ -311,7 +314,7 @@ def _test_onnx_install(client: Client, device: _TCapitalDeviceStr) -> None:
sample = np.arange(20, dtype=np.float32).reshape(10, 2)

client.put_tensor("onnx-input", sample)
client.set_model("onnx-kmeans", model, "ONNX", device=device)
client.set_model("onnx-kmeans", model, "ONNX", device=device.value.upper())
client.run_model(
"onnx-kmeans", inputs=["onnx-input"], outputs=["onnx-labels", "onnx-transform"]
)
Expand Down
Loading
Loading