From 611084c613705151d634f82e6ef1855c908bf927 Mon Sep 17 00:00:00 2001 From: Yongseung Lee <57741570+show981111@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:24:58 -0400 Subject: [PATCH] Separate BSO as a server (#34) Co-authored-by: Jae-Won Chung --- .dockerignore | 1 + .gitignore | 3 + Dockerfile => docker/Dockerfile | 0 docker/bso_migration.Dockerfile | 12 + docker/bso_server.Dockerfile | 14 + docker/docker-compose.yaml | 84 ++++ docs/batch_size_optimizer/index.md | 200 ++++++++ docs/extend.md | 13 +- docs/gen_ref_pages.py | 2 + examples/ZeusDataLoader/capriccio/run_zeus.py | 2 +- examples/ZeusDataLoader/cifar100/run_zeus.py | 2 +- examples/ZeusDataLoader/imagenet/run_zeus.py | 2 +- examples/bso_server/README.md | 30 ++ examples/bso_server/mnist.Dockerfile | 13 + examples/bso_server/mnist_dp.py | 297 +++++++++++ examples/bso_server/mnist_dp.yaml | 51 ++ examples/bso_server/mnist_single_gpu.py | 264 ++++++++++ examples/bso_server/mnist_single_gpu.yaml | 30 ++ examples/bso_server/run_single.py | 410 +++++++++++++++ examples/trace_driven/README.md | 3 +- examples/trace_driven/run_alibaba.py | 4 +- examples/trace_driven/run_single.py | 7 +- mkdocs.yml | 8 +- pyproject.toml | 13 +- tests/optimizer/batch_size/conftest.py | 92 ++++ .../batch_size/simulate_with_server.py | 470 ++++++++++++++++++ tests/optimizer/batch_size/test_client.py | 154 ++++++ tests/optimizer/batch_size/test_explorer.py | 208 ++++++++ tests/optimizer/batch_size/test_server.py | 357 +++++++++++++ tests/optimizer/batch_size/test_simulator.py | 135 +++++ tests/test_batch_size_optimizer.py | 121 ----- zeus/__init__.py | 4 +- zeus/_legacy/__init__.py | 5 + zeus/{ => _legacy}/policy/__init__.py | 17 +- zeus/{ => _legacy}/policy/interface.py | 4 +- zeus/{ => _legacy}/policy/mab.py | 0 zeus/{ => _legacy}/policy/optimizer.py | 12 +- zeus/{ => _legacy}/simulate.py | 2 +- zeus/monitor/energy.py | 3 + zeus/monitor/power.py | 3 + zeus/optimizer/batch_size/__init__.py | 1 + 
zeus/optimizer/batch_size/alembic.ini | 116 +++++ zeus/optimizer/batch_size/client.py | 217 ++++++++ zeus/optimizer/batch_size/common.py | 172 +++++++ zeus/optimizer/batch_size/exceptions.py | 33 ++ .../batch_size/migrations/.gitignore | 1 + .../optimizer/batch_size/migrations/README.md | 38 ++ zeus/optimizer/batch_size/migrations/env.py | 98 ++++ .../batch_size/migrations/script.py.mako | 24 + .../batch_size/migrations/versions/.gitkeep | 0 zeus/optimizer/batch_size/server/__init__.py | 44 ++ .../server/batch_size_state/__init__.py | 4 + .../server/batch_size_state/commands.py | 167 +++++++ .../server/batch_size_state/models.py | 251 ++++++++++ .../server/batch_size_state/repository.py | 253 ++++++++++ zeus/optimizer/batch_size/server/config.py | 59 +++ .../batch_size/server/database/__init__.py | 1 + .../server/database/db_connection.py | 77 +++ .../batch_size/server/database/repository.py | 16 + .../batch_size/server/database/schema.py | 192 +++++++ .../optimizer/batch_size/server/exceptions.py | 57 +++ zeus/optimizer/batch_size/server/explorer.py | 126 +++++ .../batch_size/server/job/__init__.py | 1 + .../batch_size/server/job/commands.py | 160 ++++++ .../optimizer/batch_size/server/job/models.py | 80 +++ .../batch_size/server/job/repository.py | 168 +++++++ zeus/optimizer/batch_size/server/mab.py | 299 +++++++++++ zeus/optimizer/batch_size/server/optimizer.py | 280 +++++++++++ zeus/optimizer/batch_size/server/router.py | 193 +++++++ .../batch_size/server/services/__init__.py | 1 + .../batch_size/server/services/commands.py | 43 ++ .../batch_size/server/services/service.py | 393 +++++++++++++++ zeus/run/master.py | 6 +- 73 files changed, 6462 insertions(+), 161 deletions(-) rename Dockerfile => docker/Dockerfile (100%) create mode 100644 docker/bso_migration.Dockerfile create mode 100644 docker/bso_server.Dockerfile create mode 100644 docker/docker-compose.yaml create mode 100644 docs/batch_size_optimizer/index.md create mode 100644 
examples/bso_server/README.md create mode 100644 examples/bso_server/mnist.Dockerfile create mode 100644 examples/bso_server/mnist_dp.py create mode 100644 examples/bso_server/mnist_dp.yaml create mode 100644 examples/bso_server/mnist_single_gpu.py create mode 100644 examples/bso_server/mnist_single_gpu.yaml create mode 100644 examples/bso_server/run_single.py create mode 100644 tests/optimizer/batch_size/conftest.py create mode 100644 tests/optimizer/batch_size/simulate_with_server.py create mode 100644 tests/optimizer/batch_size/test_client.py create mode 100644 tests/optimizer/batch_size/test_explorer.py create mode 100644 tests/optimizer/batch_size/test_server.py create mode 100644 tests/optimizer/batch_size/test_simulator.py delete mode 100644 tests/test_batch_size_optimizer.py create mode 100644 zeus/_legacy/__init__.py rename zeus/{ => _legacy}/policy/__init__.py (60%) rename zeus/{ => _legacy}/policy/interface.py (95%) rename zeus/{ => _legacy}/policy/mab.py (100%) rename zeus/{ => _legacy}/policy/optimizer.py (97%) rename zeus/{ => _legacy}/simulate.py (99%) create mode 100644 zeus/optimizer/batch_size/__init__.py create mode 100644 zeus/optimizer/batch_size/alembic.ini create mode 100644 zeus/optimizer/batch_size/client.py create mode 100644 zeus/optimizer/batch_size/common.py create mode 100644 zeus/optimizer/batch_size/exceptions.py create mode 100644 zeus/optimizer/batch_size/migrations/.gitignore create mode 100644 zeus/optimizer/batch_size/migrations/README.md create mode 100644 zeus/optimizer/batch_size/migrations/env.py create mode 100644 zeus/optimizer/batch_size/migrations/script.py.mako create mode 100644 zeus/optimizer/batch_size/migrations/versions/.gitkeep create mode 100644 zeus/optimizer/batch_size/server/__init__.py create mode 100644 zeus/optimizer/batch_size/server/batch_size_state/__init__.py create mode 100644 zeus/optimizer/batch_size/server/batch_size_state/commands.py create mode 100644 
zeus/optimizer/batch_size/server/batch_size_state/models.py create mode 100644 zeus/optimizer/batch_size/server/batch_size_state/repository.py create mode 100644 zeus/optimizer/batch_size/server/config.py create mode 100644 zeus/optimizer/batch_size/server/database/__init__.py create mode 100644 zeus/optimizer/batch_size/server/database/db_connection.py create mode 100644 zeus/optimizer/batch_size/server/database/repository.py create mode 100644 zeus/optimizer/batch_size/server/database/schema.py create mode 100644 zeus/optimizer/batch_size/server/exceptions.py create mode 100644 zeus/optimizer/batch_size/server/explorer.py create mode 100644 zeus/optimizer/batch_size/server/job/__init__.py create mode 100644 zeus/optimizer/batch_size/server/job/commands.py create mode 100644 zeus/optimizer/batch_size/server/job/models.py create mode 100644 zeus/optimizer/batch_size/server/job/repository.py create mode 100644 zeus/optimizer/batch_size/server/mab.py create mode 100644 zeus/optimizer/batch_size/server/optimizer.py create mode 100644 zeus/optimizer/batch_size/server/router.py create mode 100644 zeus/optimizer/batch_size/server/services/__init__.py create mode 100644 zeus/optimizer/batch_size/server/services/commands.py create mode 100644 zeus/optimizer/batch_size/server/services/service.py diff --git a/.dockerignore b/.dockerignore index 7f582064..c79610d2 100644 --- a/.dockerignore +++ b/.dockerignore @@ -10,3 +10,4 @@ zeus.egg-info/ .git/ **/data/ +**/versions/*.py \ No newline at end of file diff --git a/.gitignore b/.gitignore index bc951838..e1015935 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,7 @@ dist/ *.json **/.DS_Store .cache/ +.env env/ +.pytest_cache/ +/envs diff --git a/Dockerfile b/docker/Dockerfile similarity index 100% rename from Dockerfile rename to docker/Dockerfile diff --git a/docker/bso_migration.Dockerfile b/docker/bso_migration.Dockerfile new file mode 100644 index 00000000..983bfe83 --- /dev/null +++ b/docker/bso_migration.Dockerfile 
@@ -0,0 +1,12 @@ +FROM python:3.9 + +WORKDIR /workspace + +ADD . /workspace + +# For sqlite +# RUN pip install --no-cache-dir aiosqlite + +# For mysql +RUN pip install --no-cache-dir asyncmy +RUN pip install --no-cache-dir '.[migration]' diff --git a/docker/bso_server.Dockerfile b/docker/bso_server.Dockerfile new file mode 100644 index 00000000..43fa6bf1 --- /dev/null +++ b/docker/bso_server.Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.9 + +WORKDIR /workspace + +ADD . /workspace + +# For sqlite +# RUN pip install --no-cache-dir aiosqlite + +# For mysql +RUN pip install --no-cache-dir asyncmy +RUN pip install --no-cache-dir '.[bso-server]' + +CMD ["uvicorn", "zeus.optimizer.batch_size.server.router:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml new file mode 100644 index 00000000..e860be2a --- /dev/null +++ b/docker/docker-compose.yaml @@ -0,0 +1,84 @@ +version: '3.9' +name: zeus_bso_server + +services: + server: + image: bso-server + build: + context: ../ + dockerfile: ./docker/bso_server.Dockerfile + container_name: bso + restart: always + environment: + ZEUS_BSO_DATABASE_URL: ${ZEUS_BSO_DATABASE_URL-mysql+asyncmy://${ZEUS_BSO_DB_USER}:${ZEUS_BSO_DB_PASSWORD}@db:3306/Zeus} + ZEUS_BSO_LOG_LEVEL: ${ZEUS_BSO_LOG_LEVEL} + ZEUS_BSO_ECHO_SQL: ${ZEUS_BSO_ECHO_SQL} + ports: + # Map 80 to the container + - "80:80" + networks: + - servernet + depends_on: + migration: + # start running when migration is done. + condition: service_completed_successfully + labels: + # labels for kubernetes + kompose.service.type: nodeport + # Pull image only when there is no image locally. Otherewise use that one. + kompose.image-pull-policy: IfNotPresent + # set the node port. 
Should be 30000-32767 + kompose.service.nodeport.port: ${ZEUS_BSO_SERVER_PORT-30100} + db: + image: mysql + container_name: db + restart: always + environment: + MYSQL_DATABASE: Zeus + MYSQL_USER: ${ZEUS_BSO_DB_USER} + MYSQL_ROOT_PASSWORD: ${ZEUS_BSO_ROOT_PASSWORD} + MYSQL_PASSWORD: ${ZEUS_BSO_DB_PASSWORD} + expose: + # Opens 3306 on the container to server & migration + - 3306 + volumes: + - ./mysql_data:/var/lib/mysql + networks: + - servernet + healthcheck: + test: ["CMD", "mysqladmin" ,"ping", "-h", "localhost"] + timeout: 3s + retries: 10 + start_period: 2s + start_interval: 1s + + migration: + image: bso-migration + build: + context: ../ + dockerfile: ./docker/bso_migration.Dockerfile + deploy: + restart_policy: + condition: on-failure + max_attempts: 3 + depends_on: + db: + # wait until db is ready to accept connection + condition: service_healthy + # Generate revision and upgrade database. Change message of revision as you want + command: > + bash -c 'cd /workspace/zeus/optimizer/batch_size && alembic revision --autogenerate -m "Baseline: create tables" && alembic upgrade head' + environment: + ZEUS_BSO_DATABASE_URL: ${ZEUS_BSO_DATABASE_URL-mysql+asyncmy://${ZEUS_BSO_DB_USER}:${ZEUS_BSO_DB_PASSWORD}@db:3306/Zeus} + networks: + - servernet + volumes: + # mount version scripts we generated. + - ./zeus/optimizer/batch_size/migrations/versions:/workspace/zeus/optimizer/batch_size/migrations/versions + labels: + kompose.image-pull-policy: IfNotPresent + + +networks: + servernet: + driver: bridge diff --git a/docs/batch_size_optimizer/index.md b/docs/batch_size_optimizer/index.md new file mode 100644 index 00000000..b60e497e --- /dev/null +++ b/docs/batch_size_optimizer/index.md @@ -0,0 +1,200 @@ +# Batch Size Optimizer in Zeus + +## What is it? 
+ +The batch size optimizer (BSO) chooses the batch size that minimizes the cost, where cost is defined as $cost = \eta \times \text{Energy consumption to accuracy} + (1-\eta) \times \text{Max power}\times \text{Time to accuracy}$. + +## How does it work? + +The core of BSO is a multi-armed bandit (MAB) based on **recurrent** training. After each training run, we feed the resulting cost to the MAB, and after a certain number of training runs the MAB converges to the best batch size. In addition to the MAB, we employ early stopping and pruning to handle stragglers. For more details, refer to the [paper](https://www.usenix.org/conference/nsdi23/presentation/you). + +## Should I use this? + +The key of BSO is recurrent training. If you are training your model periodically or repeatedly, BSO can be a great choice to reduce energy or time consumption. + +## Limitations + +We currently don't support heterogeneous GPUs or different configurations. The number of GPUs, GPU models, and other configurations in JobSpec should be identical across recurrent training runs. If you run your training in a different environment each time, then it might not be desirable to use BSO. + +## Sequence diagram of BSO + +```mermaid +sequenceDiagram; + participant BSO server + participant BSO client + loop Every recurrent training + BSO client->>BSO server: Register the training job and ask for the batch size + BSO server->>BSO client: Return the next batch size to use with a trial number + loop Every epoch + BSO client->>BSO server: At the end of each epoch, report the result + BSO server->>BSO client: Compute the cost and tell the client if it should stop the training + end + BSO client->>BSO server: Report the end of the trial on exit + end +``` + +## Quick start (Server) + +1. Clone the repository + + ```Shell + git clone https://github.com/ml-energy/zeus.git + ``` + +2. Create `.env` under `/docker`. An example of `.env` is provided below. + + By default, we use MySQL for the database. 
+ + ```Shell + ZEUS_BSO_DB_USER="me" + ZEUS_BSO_DB_PASSWORD="secret" + ZEUS_BSO_ROOT_PASSWORD="secret*" + ZEUS_BSO_SERVER_PORT=30100 + ZEUS_BSO_LOG_LEVEL="INFO" + ZEUS_BSO_ECHO_SQL="True" + ``` + + If you want to use a different database, you need to add `ZEUS_BSO_DATABASE_URL` as an environment variable. See [Remark](#remark-about-the-server) for details. + Also, if you are running with docker-compose or Kubernetes, you need to change the image name under `db` in the docker-compose file. + +3. Running a server + + - Using docker-compose + + ```Shell + cd docker + docker-compose -f ./docker-compose.yaml up + ``` + + This will build images for each container: db, migration, and the server. Then, it will spin up those containers. + + - Using Kubernetes. + + 1. Build an image. + + ```Shell + # From the root directory + docker build -f ./docker/bso_server.Dockerfile -t bso-server . + docker build -f ./docker/bso_migration.Dockerfile -t bso-migration . + ``` + + 2. Create Kubernetes yaml files using Kompose. Kompose is a tool that converts docker-compose files into Kubernetes files. For more information, visit [Kompose Reference](#kompose-references) + + ```Shell + cd docker + docker-compose config > docker-compose-resolved.yaml && kompose convert -f docker-compose-resolved.yaml -o ./kube/ && rm docker-compose-resolved.yaml + ``` + + It first resolves env files using docker-compose, then creates Kubernetes yaml files under `docker/kube/` + + 3. Run Kubernetes. + + ```Shell + cd kube + kubectl apply -f . + ``` + + - Using uvicorn. + + If you are using uvicorn to spin up the server, you need to create a database and perform the migration before starting the server. + + 1. Run the database of your choice. + 2. Set the environment variables in `.env` + + ```Shell + ZEUS_BSO_DATABASE_URL="mysql+asyncmy://me:secret@localhost:3306/Zeus" + ZEUS_BSO_LOG_LEVEL="INFO" + ZEUS_BSO_ECHO_SQL="True" + ``` + + 3. Run Alembic migration + + 1. Install dependencies + + ```Bash + pip install '.[migration]' + ``` + + 2. 
Create the migration script. Run this from `zeus/optimizer/batch_size`, where `alembic.ini` lives. This will create scripts under `./migrations/versions` + + ```Bash + alembic revision --autogenerate -m "Baseline: create tables" + ``` + + 3. Apply migration + 1. Online (apply it to the database directly) + + ```Bash + alembic upgrade head + ``` + + 2. Offline (generate SQL) + + ```Bash + alembic upgrade head --sql + ``` + + 4. Run the server using uvicorn. + + ```Shell + cd zeus/optimizer/batch_size/server + uvicorn router:app --reload + ``` + + Now the server is good to go! + +### Remark about the server + +The Zeus Batch Size Optimizer server uses SQLAlchemy to support various types of databases. However, you need to install the corresponding async connection driver. +By default, we use MySQL. You can add the driver installation command to `bso_migration.Dockerfile` and `bso_server.Dockerfile`. Refer to those files for reference. + +## Use BSO in your training script (Client) + +1. Install Zeus package. + + ```Shell + pip install 'zeus-ml[bso]' + ``` + +2. Add [`BatchSizeOptimizer`][zeus.optimizer.batch_size.client.BatchSizeOptimizer] to your training script. + + ```Python + # Initialization + bso = BatchSizeOptimizer( + monitor=monitor, + server_url="http://127.0.0.1:8000", + job=JobSpec( + job_id_prefix="mnist-dev", + default_batch_size=256, + batch_sizes=[32, 64, 256, 512, 1024, 4096, 2048], + max_epochs=100 + ), + ) + # ... other code + + # Get batch size to use from the server + batch_size = bso.get_batch_size() + + # ... + + # At the beginning of training + bso.on_train_begin() + + # ... + + # After evaluation + bso.on_evaluate(metric) + ``` + +### Remark about the client + +Training can fail if + +1. It failed to converge within the configured max_epochs +2. It exceeded the early stopping threshold, which is configured by `beta_knob` in `JobSpec` + +In that case, the optimizer will raise `ZeusBSOTrainFailError`. This means that the chosen batch size was not useful, and the BSO server will not give this batch size again. 
However, the user ***should re-launch the job*** so that the BSO server can give another batch size. The server will learn which batch size is useful and will converge to the batch size that causes the least cost as you launch the job multiple times. + +## Kompose references + +Refer [Kompose](https://kompose.io/) and [Kompose labels](https://github.com/kubernetes/kompose/blob/main/docs/user-guide.md) for more information. diff --git a/docs/extend.md b/docs/extend.md index 497b2d54..638713d2 100644 --- a/docs/extend.md +++ b/docs/extend.md @@ -9,12 +9,11 @@ Users can implement custom policies to optimize batch size and power limits, and ## Interfaces -Zeus defines two abstract classes [`BatchSizeOptimizer`][zeus.policy.BatchSizeOptimizer] and [`PowerLimitOptimizer`][zeus.policy.PowerLimitOptimizer] in [`zeus.policy.interface`][zeus.policy.interface]. +Zeus defines two abstract classes [`BatchSizeOptimizer`][zeus._legacy.policy.BatchSizeOptimizer] and [`PowerLimitOptimizer`][zeus._legacy.policy.PowerLimitOptimizer] in [`zeus._legacy.policy.interface`][zeus._legacy.policy.interface]. Each class optimizes the batch size and power limit of a recurring training job respectively. As in our paper, the batch size optimizer is first invoked to decide which batch size to use, and then the power limit optimizer is invoked with both the job and the batch size chosen to decide which power limit to use. -You can find examples of policy implementations in [`zeus.policy.optimizer`][zeus.policy.optimizer]. - +You can find examples of policy implementations in [`zeus._legacy.policy.optimizer`][zeus._legacy.policy.optimizer]. ## Plugging it into Zeus @@ -22,13 +21,13 @@ There are two ways to run Zeus: trace-driven and end-to-end. ### Trace-driven Zeus -The Zeus simulator ([`Simulator`][zeus.simulate.Simulator]) accepts one [`BatchSizeOptimizer`][zeus.policy.BatchSizeOptimizer] and [`PowerLimitOptimizer`][zeus.policy.PowerLimitOptimizer] in its constructor. 
+The Zeus simulator ([`Simulator`][zeus._legacy.simulate.Simulator]) accepts one [`BatchSizeOptimizer`][zeus._legacy.policy.BatchSizeOptimizer] and [`PowerLimitOptimizer`][zeus._legacy.policy.PowerLimitOptimizer] in its constructor. A full-example can be found in [`examples/trace_driven`](https://github.com/ml-energy/zeus/tree/master/examples/trace_driven/). ### End-to-end Zeus There are two central components in end-to-end Zeus: [`ZeusMaster`][zeus.run.ZeusMaster] and [`ZeusDataLoader`][zeus.run.ZeusDataLoader]. -The former takes charge of driving the entire optimization over recurring jobs, and accepts an instance of [`BatchSizeOptimizer`][zeus.policy.BatchSizeOptimizer] in its constructor. +The former takes charge of driving the entire optimization over recurring jobs, and accepts an instance of [`BatchSizeOptimizer`][zeus._legacy.policy.BatchSizeOptimizer] in its constructor. The latter takes charge of JIT-profiling power in the background, determining the optimal power limit, and setting it. -Hence, the functionality of [`JITPowerLimitOptimizer`][zeus.policy.optimizer.JITPowerLimitOptimizer] is already tightly integrated into `ZeusDataLoader`. -Users will have to implement their own [`ZeusDataLoader`][zeus.run.ZeusDataLoader] in order to test another [`PowerLimitOptimizer`][zeus.policy.PowerLimitOptimizer] policy. +Hence, the functionality of [`JITPowerLimitOptimizer`][zeus._legacy.policy.optimizer.JITPowerLimitOptimizer] is already tightly integrated into `ZeusDataLoader`. +Users will have to implement their own [`ZeusDataLoader`][zeus.run.ZeusDataLoader] in order to test another [`PowerLimitOptimizer`][zeus._legacy.policy.PowerLimitOptimizer] policy. diff --git a/docs/gen_ref_pages.py b/docs/gen_ref_pages.py index d66e04fa..14b637a1 100644 --- a/docs/gen_ref_pages.py +++ b/docs/gen_ref_pages.py @@ -26,6 +26,8 @@ for path in sorted(Path("zeus").rglob("*.py")): # Path to the generated markdown file. 
doc_path = path.relative_to("zeus").with_suffix(".md") + if str(doc_path).find("batch_size/migration") != -1: + continue full_doc_path = REF_DIR / doc_path module_path = path.with_suffix("") diff --git a/examples/ZeusDataLoader/capriccio/run_zeus.py b/examples/ZeusDataLoader/capriccio/run_zeus.py index f759e969..fa242991 100644 --- a/examples/ZeusDataLoader/capriccio/run_zeus.py +++ b/examples/ZeusDataLoader/capriccio/run_zeus.py @@ -19,7 +19,7 @@ from pathlib import Path from zeus.job import Job -from zeus.policy import PruningGTSBatchSizeOptimizer +from zeus._legacy.policy import PruningGTSBatchSizeOptimizer from zeus.run import ZeusMaster from zeus.util import FileAndConsole diff --git a/examples/ZeusDataLoader/cifar100/run_zeus.py b/examples/ZeusDataLoader/cifar100/run_zeus.py index ed9fa113..21efa079 100644 --- a/examples/ZeusDataLoader/cifar100/run_zeus.py +++ b/examples/ZeusDataLoader/cifar100/run_zeus.py @@ -19,7 +19,7 @@ from pathlib import Path from zeus.job import Job -from zeus.policy import PruningGTSBatchSizeOptimizer +from zeus._legacy.policy import PruningGTSBatchSizeOptimizer from zeus.run import ZeusMaster from zeus.util import FileAndConsole diff --git a/examples/ZeusDataLoader/imagenet/run_zeus.py b/examples/ZeusDataLoader/imagenet/run_zeus.py index 03e05f56..bcea9303 100644 --- a/examples/ZeusDataLoader/imagenet/run_zeus.py +++ b/examples/ZeusDataLoader/imagenet/run_zeus.py @@ -5,7 +5,7 @@ from pathlib import Path from zeus.job import Job -from zeus.policy import PruningGTSBatchSizeOptimizer +from zeus._legacy.policy import PruningGTSBatchSizeOptimizer from zeus.run import ZeusMaster from zeus.util import FileAndConsole diff --git a/examples/bso_server/README.md b/examples/bso_server/README.md new file mode 100644 index 00000000..56bc6e36 --- /dev/null +++ b/examples/bso_server/README.md @@ -0,0 +1,30 @@ +# Batch Size Optimizer in Zeus + +Batch size optimzer is composed of two parts: server and client. 
The client will be running in your training script just like the power limit optimizer or monitor. This client will send training results to the BSO server, and the server will give the client the best batch size to use. Refer to `docs/batch_size_optimizer/index.md` for how to get started. + +## Data parallel training with Zeus + +In the case of data parallel training, the batch size optimizer should give a consistent batch size to all GPUs. Since there is no way for the batch size optimizer to tell the difference between concurrent job submissions and multi-GPU training, we ask users to send a request from a single GPU and broadcast the result (batch size, trial number) to the other GPUs. Reporting the result to the batch size optimizer server and receiving the corresponding response (train failed or succeeded) can be handled by the server alone since it has the `trial_number`. Thus, reporting doesn't require any broadcast or communication with other GPUs. +Refer to `examples/bso_server/mnist_dp.py` for the use case. + +## Kubeflow + +Kubeflow is a tool to easily deploy your ML workflows to Kubernetes. We provide some examples of using Kubeflow with Zeus. In order to run your training in Kubeflow with Zeus, follow `docs/batch_size_optimizer/index.md` to deploy the batch size optimizer to Kubernetes. After that, you can deploy your training script using Kubeflow. + +1. Install Kubeflow training operator. + + Refer to [Kubeflow training operator](https://github.com/kubeflow/training-operator) for how to install Kubeflow. + +2. Build mnist example docker image. + + ```Shell + # From project root directory + docker build -f ./examples/bso_server/mnist.Dockerfile -t mnist-example . + ``` + +3. Deploy training script. 
+ + ```Shell + kubectl apply -f mnist_dp.yaml # For distributed training example + kubectl apply -f mnist_single_gpu.yaml # For single gpu training example + ``` diff --git a/examples/bso_server/mnist.Dockerfile b/examples/bso_server/mnist.Dockerfile new file mode 100644 index 00000000..4a9d8290 --- /dev/null +++ b/examples/bso_server/mnist.Dockerfile @@ -0,0 +1,13 @@ +FROM pytorch/pytorch:2.1.2-cuda11.8-cudnn8-runtime +# FROM nvcr.io/nvidia/pytorch:24.01-py3 + +RUN pip install tensorboardX + +WORKDIR /workspace + +ADD . /workspace + +RUN pip install --no-cache-dir -e '.[bso]' + +RUN chgrp -R 0 /workspace \ + && chmod -R g+rwX /workspace \ No newline at end of file diff --git a/examples/bso_server/mnist_dp.py b/examples/bso_server/mnist_dp.py new file mode 100644 index 00000000..7154ff5a --- /dev/null +++ b/examples/bso_server/mnist_dp.py @@ -0,0 +1,297 @@ +"""Mnist example from https://github.com/kubeflow/training-operator/blob/c20422067e3ef81df39d03c6f285353344d8f77d/examples/pytorch/mnist/mnist.py""" + +from __future__ import print_function + +import argparse +import os +from time import sleep + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from tensorboardX import SummaryWriter +from torch.utils.data import DistributedSampler +from torchvision import datasets, transforms + +from zeus.callback import Callback, CallbackSet +from zeus.monitor import ZeusMonitor +from zeus.optimizer import GlobalPowerLimitOptimizer +from zeus.optimizer.batch_size.client import BatchSizeOptimizer +from zeus.optimizer.batch_size.common import JobSpec +from zeus.optimizer.batch_size.exceptions import ZeusBSOTrainFailError +from zeus.optimizer.power_limit import MaxSlowdownConstraint +from zeus.util.env import get_env + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 20, 5, 1) + self.conv2 = nn.Conv2d(20, 50, 5, 1) + self.fc1 = nn.Linear(4 * 4 
* 50, 500) + self.fc2 = nn.Linear(500, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = F.max_pool2d(x, 2, 2) + x = F.relu(self.conv2(x)) + x = F.max_pool2d(x, 2, 2) + x = x.view(-1, 4 * 4 * 50) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + +def train(args, model, device, train_loader, epoch, writer, callbacks: CallbackSet): + model.train() + optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) + + for batch_idx, (data, target) in enumerate(train_loader): + ### Zeus usage: call callback for step begin + callbacks.on_step_begin() + + # Attach tensors to the device. + data, target = data.to(device), target.to(device) + + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) + niter = epoch * len(train_loader) + batch_idx + writer.add_scalar("loss", loss.item(), niter) + + ### Zeus usage: call callback for step end + callbacks.on_step_end() + + +def test(model, device, test_loader, writer, epoch) -> float: + model.eval() + + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + # Attach tensors to the device. + data, target = data.to(device), target.to(device) + + output = model(data) + # Get the index of the max log-probability. 
+ pred = output.max(1, keepdim=True)[1] + correct += pred.eq(target.view_as(pred)).sum().item() + + print("\naccuracy={:.4f}\n".format(float(correct) / len(test_loader.dataset))) + writer.add_scalar("accuracy", float(correct) / len(test_loader.dataset), epoch) + return float(correct) / len(test_loader.dataset) + + +def main(): + # Training settings + parser = argparse.ArgumentParser(description="PyTorch FashionMNIST Example") + + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", + type=int, + default=1, + metavar="N", + help="number of epochs to train (default: 10)", + ) + parser.add_argument( + "--lr", + type=float, + default=0.01, + metavar="LR", + help="learning rate (default: 0.01)", + ) + parser.add_argument( + "--momentum", + type=float, + default=0.5, + metavar="M", + help="SGD momentum (default: 0.5)", + ) + parser.add_argument( + "--no-cuda", + action="store_true", + default=False, + help="disables CUDA training", + ) + parser.add_argument( + "--seed", + type=int, + default=1, + metavar="S", + help="random seed (default: 1)", + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", + action="store_true", + default=False, + help="For Saving the current Model", + ) + parser.add_argument( + "--dir", + default="logs", + metavar="L", + help="directory where summary logs are stored", + ) + + parser.add_argument( + "--backend", + type=str, + help="Distributed backend", + choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI], + default=dist.Backend.GLOO, + ) + + args = parser.parse_args() + use_cuda = not args.no_cuda and torch.cuda.is_available() + if use_cuda: + print("Using CUDA") + if args.backend != dist.Backend.NCCL: + print( + "Warning. 
Please use `nccl` distributed backend for the best performance using GPUs" + ) + + writer = SummaryWriter(args.dir) + + torch.manual_seed(args.seed) + + print("Using distributed PyTorch with {} backend".format(args.backend)) + # Set distributed training environment variables to run this training script locally. + if "WORLD_SIZE" not in os.environ: + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "1234" + + rank = int(os.getenv("RANK")) + world_size = int(os.getenv("WORLD_SIZE")) + + print(f"World Size: {os.environ['WORLD_SIZE']}. Rank: {rank}") + + dist.init_process_group(backend=args.backend, init_method="env://") + + device = torch.device("cuda" if use_cuda else "cpu") + model = Net().to(device) + + model = nn.parallel.DistributedDataParallel(model) + + # Get FashionMNIST train and test dataset. + train_ds = datasets.FashionMNIST( + "../data", + train=True, + download=True, + transform=transforms.Compose([transforms.ToTensor()]), + ) + test_ds = datasets.FashionMNIST( + "../data", + train=False, + download=True, + transform=transforms.Compose([transforms.ToTensor()]), + ) + + ########################### ZEUS INIT BEGIN ########################### + monitor = ZeusMonitor(gpu_indices=None) # All visible GPUs. + bso = BatchSizeOptimizer( + monitor=monitor, + server_url=get_env("ZEUS_SERVER_URL", str, "http://localhost:30100"), + job=JobSpec( + job_id=get_env("ZEUS_JOB_ID", str, "mnist-dev-dp-1"), + job_id_prefix="mnist-dev", + default_batch_size=256, + batch_sizes=[32, 64, 256, 512, 1024, 4096, 2048], + max_epochs=5, + ), + rank=rank, + ) + + # The rank 0 process will monitor and optimize the power limit of all GPUs. 
+ if rank == 0: + callback_set: list[Callback] = [ + # plo + GlobalPowerLimitOptimizer( + monitor=monitor, + optimum_selector=MaxSlowdownConstraint( + factor=get_env("ZEUS_MAX_SLOWDOWN", float, 1.1), + ), + warmup_steps=10, + profile_steps=40, + pl_step=25, + ), + bso, + ] + # Get batch size from bso + batch_size = bso.get_batch_size() + print("Rank", dist.get_rank()) + print("Chosen batach_size:", batch_size) + # Need to broadcast 1. batch size 2. trial_number to identify current trial from other GPUs + bs_trial_tensor = torch.tensor([batch_size, bso.trial_number], device=device) + else: + sleep(3) # Wait for the initialization of rank=0 gpu + callback_set = [bso] + bs_trial_tensor = torch.tensor([0, 0], device=device) + print("Rank", dist.get_rank()) + + dist.broadcast(bs_trial_tensor, src=0) + batch_size = bs_trial_tensor[0].item() // world_size + bso.trial_number = bs_trial_tensor[1].item() + + print(f"Batach_size to use for gpu[{rank}]: {batch_size}") + callbacks = CallbackSet(callback_set) + + ########################### ZEUS INIT END ########################### + + # Add train and test loaders. 
+ train_loader = torch.utils.data.DataLoader( + train_ds, + batch_size=batch_size, + sampler=DistributedSampler(train_ds), + ) + test_loader = torch.utils.data.DataLoader( + test_ds, + batch_size=args.test_batch_size, + sampler=DistributedSampler(test_ds), + ) + + ########################### ZEUS USAGE BEGIN ########################### + callbacks.on_train_begin() + for epoch in range(1, args.epochs + 1): + callbacks.on_epoch_begin() + train(args, model, device, train_loader, epoch, writer, callbacks) + callbacks.on_epoch_end() + acc = test(model, device, test_loader, writer, epoch) + callbacks.on_evaluate(acc) + ########################### ZEUS USAGE END ########################### + + if args.save_model: + torch.save(model.state_dict(), "mnist_cnn.pt") + + +if __name__ == "__main__": + main() diff --git a/examples/bso_server/mnist_dp.yaml b/examples/bso_server/mnist_dp.yaml new file mode 100644 index 00000000..f9871dad --- /dev/null +++ b/examples/bso_server/mnist_dp.yaml @@ -0,0 +1,51 @@ +apiVersion: "kubeflow.org/v1" +kind: PyTorchJob +metadata: + name: pytorch-zeus-mnist-dp + namespace: kubeflow +spec: + pytorchReplicaSpecs: + Master: + replicas: 1 + restartPolicy: OnFailure + template: + spec: + containers: + - name: pytorch + image: mnist-example + imagePullPolicy: Never + command: + - "python3" + - "/workspace/examples/bso_server/mnist_dp.py" + - "--epochs=5" + - "--backend=nccl" + env: + - name: ZEUS_SERVER_URL + value: "http://server:80" + - name: ZEUS_JOB_ID + value: "mnist-dev-dp-2" + securityContext: + capabilities: + add: ["SYS_ADMIN"] + Worker: + replicas: 1 + restartPolicy: OnFailure + template: + spec: + containers: + - name: pytorch + image: mnist-example + imagePullPolicy: Never + command: + - "python3" + - "/workspace/examples/bso_server/mnist_dp.py" + - "--epochs=5" + - "--backend=nccl" + env: + - name: ZEUS_SERVER_URL + value: "http://server:80" + - name: ZEUS_JOB_ID + value: "mnist-dev-dp-2" + securityContext: + capabilities: + add: 
["SYS_ADMIN"] \ No newline at end of file diff --git a/examples/bso_server/mnist_single_gpu.py b/examples/bso_server/mnist_single_gpu.py new file mode 100644 index 00000000..8c436283 --- /dev/null +++ b/examples/bso_server/mnist_single_gpu.py @@ -0,0 +1,264 @@ +""" +Based on https://github.com/kubeflow/pytorch-operator/blob/master/examples/mnist/mnist.py +""" + +from __future__ import print_function + +import argparse +import os + +from tensorboardX import SummaryWriter +from torchvision import datasets, transforms +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from zeus.monitor import ZeusMonitor +from zeus.optimizer import GlobalPowerLimitOptimizer +from zeus.optimizer.batch_size.client import BatchSizeOptimizer +from zeus.optimizer.batch_size.common import JobSpec + + +WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1)) + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 20, 5, 1) + self.conv2 = nn.Conv2d(20, 50, 5, 1) + self.fc1 = nn.Linear(4 * 4 * 50, 500) + self.fc2 = nn.Linear(500, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = F.max_pool2d(x, 2, 2) + x = F.relu(self.conv2(x)) + x = F.max_pool2d(x, 2, 2) + x = x.view(-1, 4 * 4 * 50) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + +def train(args, model, device, train_loader, optimizer, epoch, writer, plo): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + plo.on_step_begin() + + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + + if batch_idx % args.log_interval == 0: + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) + niter = epoch * 
len(train_loader) + batch_idx + writer.add_scalar("loss", loss.item(), niter) + plo.on_step_end() + + +def test(args, model, device, test_loader, writer, epoch): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + test_loss += F.nll_loss( + output, target, reduction="sum" + ).item() # sum up batch loss + pred = output.max(1, keepdim=True)[ + 1 + ] # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + print("\naccuracy={:.4f}\n".format(float(correct) / len(test_loader.dataset))) + writer.add_scalar("accuracy", float(correct) / len(test_loader.dataset), epoch) + + return float(correct) / len(test_loader.dataset) + + +def should_distribute(): + return dist.is_available() and WORLD_SIZE > 1 + + +def is_distributed(): + return dist.is_available() and dist.is_initialized() + + +def main(): + # Training settings + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", + type=int, + default=1, + metavar="N", + help="number of epochs to train (default: 1)", + ) + parser.add_argument( + "--lr", + type=float, + default=0.01, + metavar="LR", + help="learning rate (default: 0.01)", + ) + parser.add_argument( + "--momentum", + type=float, + default=0.5, + metavar="M", + help="SGD momentum (default: 0.5)", + ) + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="disables CUDA training" + ) + parser.add_argument( + "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)" + ) + 
parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", + action="store_true", + default=False, + help="For Saving the current Model", + ) + parser.add_argument( + "--dir", + default="logs", + metavar="L", + help="directory where summary logs are stored", + ) + if dist.is_available(): + parser.add_argument( + "--backend", + type=str, + help="Distributed backend", + choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI], + default=dist.Backend.GLOO, + ) + args = parser.parse_args() + use_cuda = not args.no_cuda and torch.cuda.is_available() + if use_cuda: + print("Using CUDA") + + writer = SummaryWriter(args.dir) + + torch.manual_seed(args.seed) + + device = torch.device("cuda" if use_cuda else "cpu") + + if should_distribute(): + print("Using distributed PyTorch with {} backend".format(args.backend)) + dist.init_process_group(backend=args.backend) + + kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {} + + ##################### ZEUS INIT BEGIN ########################## + # Set up monitor, plo, bso. 
+ monitor = ZeusMonitor(gpu_indices=[torch.cuda.current_device()]) + plo = GlobalPowerLimitOptimizer(monitor) + bso = BatchSizeOptimizer( + monitor=monitor, + server_url=os.environ.get("ZEUS_SERVER_URL"), + job=JobSpec( + job_id=os.environ.get("ZEUS_JOB_ID"), + job_id_prefix="mnist-dev", + default_batch_size=256, + batch_sizes=[32, 64, 256, 512, 1024, 4096, 2048], + max_epochs=5 + ), + ) + # Get batch size from bso + batch_size = bso.get_batch_size() + print("Chosen batach_size:", batch_size) + + ##################### ZEUS INIT END ########################## + train_loader = torch.utils.data.DataLoader( + datasets.FashionMNIST( + "../data", + train=True, + download=True, + transform=transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ), + ), + batch_size=batch_size, + shuffle=True, + **kwargs, + ) + test_loader = torch.utils.data.DataLoader( + datasets.FashionMNIST( + "../data", + train=False, + transform=transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ), + ), + batch_size=args.test_batch_size, + shuffle=False, + **kwargs, + ) + + model = Net().to(device) + + if is_distributed(): + Distributor = ( + nn.parallel.DistributedDataParallel + if use_cuda + else nn.parallel.DistributedDataParallelCPU + ) + model = Distributor(model) + + optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) + + ################### ZEUS OPTIMIZER USAGE BEGIN ####################### + bso.on_train_begin() + + for epoch in range(1, args.epochs + 1): + plo.on_epoch_begin() + train(args, model, device, train_loader, optimizer, epoch, writer, plo) + plo.on_epoch_end() + acc = test(args, model, device, test_loader, writer, epoch) + bso.on_evaluate(acc) + + ################### ZEUS OPTIMIZER USAGE END ######################### + if args.save_model: + torch.save(model.state_dict(), "mnist_cnn.pt") + + +if __name__ == "__main__": + main() diff --git 
a/examples/bso_server/mnist_single_gpu.yaml b/examples/bso_server/mnist_single_gpu.yaml new file mode 100644 index 00000000..eca1722c --- /dev/null +++ b/examples/bso_server/mnist_single_gpu.yaml @@ -0,0 +1,30 @@ +apiVersion: "kubeflow.org/v1" +kind: PyTorchJob +metadata: + name: pytorch-zeus-mnist + namespace: kubeflow +spec: + pytorchReplicaSpecs: + Master: + replicas: 1 + restartPolicy: OnFailure + template: + spec: + containers: + - name: pytorch + image: mnist-example + imagePullPolicy: Never + command: + - "python3" + - "/workspace/examples/bso_server/mnist_single_gpu.py" + - "--epochs=5" + env: + - name: ZEUS_SERVER_URL + value: "http://server:80" + - name: ZEUS_JOB_ID + value: "mnist-dev-1" + - name: ZEUS_LOG_LEVEL + value: "INFO" + securityContext: + capabilities: + add: ["SYS_ADMIN"] \ No newline at end of file diff --git a/examples/bso_server/run_single.py b/examples/bso_server/run_single.py new file mode 100644 index 00000000..890c01fc --- /dev/null +++ b/examples/bso_server/run_single.py @@ -0,0 +1,410 @@ +import argparse +import os +import random +import time +from enum import Enum + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +from torch.optim.lr_scheduler import StepLR +import torch.utils.data +from torch.utils.data import DataLoader +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models + +# ZEUS +from zeus.monitor import ZeusMonitor +from zeus.optimizer import GlobalPowerLimitOptimizer +from zeus.optimizer.batch_size.common import JobSpec +from zeus.optimizer.power_limit import MaxSlowdownConstraint +from zeus.util.env import get_env +from zeus.optimizer.batch_size.client import BatchSizeOptimizer + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + # List choices of models + model_names = sorted( + name 
+ for name in models.__dict__ + if name.islower() + and not name.startswith("__") + and callable(models.__dict__[name]) + ) + + parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") + parser.add_argument("data", metavar="DIR", help="Path to the ImageNet directory") + parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet18", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet18)", + ) + parser.add_argument( + "-j", + "--workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers (default: 4)", + ) + parser.add_argument( + "--epochs", + default=90, + type=int, + metavar="N", + help="number of total epochs to run", + ) + parser.add_argument( + "--lr", + "--learning_rate", + default=0.1, + type=float, + metavar="LR", + help="initial learning rate", + dest="lr", + ) + parser.add_argument( + "--momentum", default=0.9, type=float, metavar="M", help="momentum" + ) + parser.add_argument( + "--wd", + "--weight_decay", + default=1e-4, + type=float, + metavar="W", + help="weight decay (default: 1e-4)", + dest="weight_decay", + ) + parser.add_argument( + "-p", + "--print_freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", + ) + parser.add_argument( + "--seed", default=None, type=int, help="seed for initializing training. 
" + ) + parser.add_argument( + "--gpu", default=0, type=int, metavar="N", help="GPU id to use (default: 0)" + ) + + return parser.parse_args() + + +def main(): + """Main function that prepares values and spawns/calls the worker function.""" + args = parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + + criterion = nn.CrossEntropyLoss().cuda(args.gpu) + optimizer = torch.optim.SGD( + model.parameters(), + args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + + traindir = os.path.join(args.data, "train") + valdir = os.path.join(args.data, "val") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ] + ), + ) + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ] + ), + ) + + ################################## The important part ##################################### + + # ZeusMonitor is used to profile the time and energy consumption of the GPU. + monitor = ZeusMonitor(gpu_indices=[args.gpu]) + + # GlobalPowerLimitOptimizer profiles each power limit and selects the best one. + # This is the power limit optimizer that's in the Zeus paper. 
+ plo = GlobalPowerLimitOptimizer( + monitor=monitor, + optimum_selector=MaxSlowdownConstraint( + factor=get_env("ZEUS_MAX_SLOWDOWN", float, 1.1), + ), + warmup_steps=10, + profile_steps=40, + pl_step=25, + ) + + bso = BatchSizeOptimizer( + monitor=monitor, + server_url="http://127.0.0.1:8000", + job=JobSpec( + job_id="1", + job_id_prefix="imagenet-dev", + default_batch_size=256, + batch_sizes=[32, 64, 128, 256, 512, 1024, 4096, 2048], + ), + ) + + batch_size = bso.get_batch_size() + + train_loader = DataLoader( + train_dataset, + batch_size=batch_size, + shuffle=True, + num_workers=args.workers, + pin_memory=True, + ) + val_loader = DataLoader( + val_dataset, + batch_size=batch_size, + shuffle=False, + num_workers=args.workers, + pin_memory=True, + ) + bso.on_train_begin() + + for epoch in range(args.epochs): + plo.on_epoch_begin() + train(train_loader, model, criterion, optimizer, epoch, args, plo) + plo.on_epoch_end() + + acc1 = validate(val_loader, model, criterion, args) + print(f"Top-1 accuracy: {acc1}") + + bso.on_evaluate(acc1) + scheduler.step() + ################################## The important part ##################################### + + +def train( + train_loader, model, criterion, optimizer, epoch, args, power_limit_optimizer +): + batch_time = AverageMeter("Time", ":6.3f") + data_time = AverageMeter("Data", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch), + ) + + # switch to train mode + model.train() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + power_limit_optimizer.on_step_begin() # Mark the beginning of one training step. 
+ + # Load data to GPU + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # measure data loading time + data_time.update(time.time() - end) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + power_limit_optimizer.on_step_end() # Mark the end of one training step. + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + +def validate(val_loader, model, criterion, args): + + batch_time = AverageMeter("Time", ":6.3f", Summary.NONE) + losses = AverageMeter("Loss", ":.4e", Summary.NONE) + top1 = AverageMeter("Acc@1", ":6.2f", Summary.AVERAGE) + top5 = AverageMeter("Acc@5", ":6.2f", Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader), + [batch_time, losses, top1, top5], + prefix="Test: ", + ) + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + # Load data to GPU + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + progress.display_summary() + + return top1.avg + + +class Summary(Enum): + NONE = 
0 + AVERAGE = 1 + SUM = 2 + COUNT = 3 + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE): + self.name = name + self.fmt = fmt + self.summary_type = summary_type + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + # DATA PARALLEL + def all_reduce(self): + device = "cuda" if torch.cuda.is_available() else "cpu" + total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.sum, self.count = total.tolist() + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" + return fmtstr.format(**self.__dict__) + + def summary(self): + fmtstr = "" + if self.summary_type is Summary.NONE: + fmtstr = "" + elif self.summary_type is Summary.AVERAGE: + fmtstr = "{name} {avg:.3f}" + elif self.summary_type is Summary.SUM: + fmtstr = "{name} {sum:.3f}" + elif self.summary_type is Summary.COUNT: + fmtstr = "{name} {count:.3f}" + else: + raise ValueError("invalid summary type %r" % self.summary_type) + + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print("\t".join(entries)) + + def display_summary(self): + entries = [" *"] + entries += [meter.summary() for meter in self.meters] + print(" ".join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = "{:" + str(num_digits) + "d}" + return 
"[" + fmt + "/" + fmt.format(num_batches) + "]" + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == "__main__": + main() diff --git a/examples/trace_driven/README.md b/examples/trace_driven/README.md index c9cb5f32..d66f3300 100644 --- a/examples/trace_driven/README.md +++ b/examples/trace_driven/README.md @@ -4,12 +4,12 @@ While the existence of recurring jobs in production GPU clusters is clear, it is Thus, Zeus provides a trace-driven simulator that allows users to plug in their own customized batch size optimizer and power limit optimizers and observe gains. We provide two types of traces. + 1. Train trace: We trained six different (model, dataset) pairs with many different batch sizes. And we repeated training at least four times for each triplet with different random seeds. Thus, when we would like to know the result of training a model on a dataset with a certain batch size, we can sample a *training path* from this trace. 2. Power trace: We profiled the the duration of one epoch and average power consumption for six (model, dataset) pairs with many different (batch size, power limit) configurations. These results not stochastic, and can be fetched from the trace to construct TTA (time to accuracy) and ETA (energy to accuracy) values. Refer to the [`trace`](../../trace/) directory for more information about the traces we provide. - ## Simulating the recurrence of one job With [`run_single.py`](run_single.py), you can simulate the optimization trajectory of one recurring job. 
@@ -37,7 +37,6 @@ python run_single.py \ --seed 1 ``` - ## Simulating jobs based on the Alibaba GPU cluster trace With [`run_alibaba.py`](run_alibaba.py), you can simulate jobs in the [Alibaba GPU cluster trace](https://github.com/alibaba/clusterdata/tree/master/cluster-trace-gpu-v2020). diff --git a/examples/trace_driven/run_alibaba.py b/examples/trace_driven/run_alibaba.py index 75e7a499..946f925b 100644 --- a/examples/trace_driven/run_alibaba.py +++ b/examples/trace_driven/run_alibaba.py @@ -26,9 +26,9 @@ import pandas as pd from zeus.job import Job -from zeus.simulate import Simulator +from zeus._legacy.simulate import Simulator from zeus.analyze import HistoryEntry -from zeus.policy import JITPowerLimitOptimizer, PruningGTSBatchSizeOptimizer +from zeus._legacy.policy import JITPowerLimitOptimizer, PruningGTSBatchSizeOptimizer def parse_args() -> argparse.Namespace: diff --git a/examples/trace_driven/run_single.py b/examples/trace_driven/run_single.py index c747143f..8d4e24fa 100644 --- a/examples/trace_driven/run_single.py +++ b/examples/trace_driven/run_single.py @@ -23,8 +23,11 @@ import pandas as pd from zeus.job import Job -from zeus.policy.optimizer import JITPowerLimitOptimizer, PruningGTSBatchSizeOptimizer -from zeus.simulate import Simulator +from zeus._legacy.policy.optimizer import ( + JITPowerLimitOptimizer, + PruningGTSBatchSizeOptimizer, +) +from zeus._legacy.simulate import Simulator from zeus.analyze import HistoryEntry diff --git a/mkdocs.yml b/mkdocs.yml index 7bbd320e..fb863ab1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -92,7 +92,11 @@ markdown_extensions: - pymdownx.emoji: emoji_index: !!python/name:material.extensions.emoji.twemoji emoji_generator: !!python/name:material.extensions.emoji.to_svg - + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format # Appearance extra_css: - assets/css/custom.css @@ -113,6 +117,8 @@ nav: - getting_started/index.md - 
Environment Setup: getting_started/environment.md - Installing Zeus: getting_started/installing.md + - Batch Size Optimizer: + - batch_size_optimizer/index.md - Perseus: - perseus/index.md - Integrating: perseus/integrating.md diff --git a/pyproject.toml b/pyproject.toml index cd654be6..92ce8a67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,9 +39,13 @@ Documentation = "https://ml.energy/zeus" [project.optional-dependencies] # One day FastAPI will drop support for Pydantic V1. Then fastapi has to be pinned as well. perseus = ["fastapi[all]", "pydantic<2", "lowtime", "aiofiles", "httpx"] +bso = ["pydantic<2", "httpx"] +bso-server = ["fastapi[all]","sqlalchemy","pydantic<2"] +migration = ["alembic", "sqlalchemy", "pydantic<2", "python-dotenv"] lint = ["ruff", "black==22.6.0"] -test = ["pytest==7.3.2", "pytest-mock==3.10.0", "pytest-xdist==3.3.1"] -dev = ["zeus-ml[perseus,lint,test]"] +test = ["fastapi[all]","sqlalchemy","pydantic<2", "httpx", "pytest==7.3.2", "pytest-mock==3.10.0", "pytest-xdist==3.3.1", "anyio==3.7.1", "aiosqlite==0.20.0"] +dev = ["zeus-ml[perseus,lint,test]", "greenlet"] +# greenlet is for supporting apple mac silicon for sqlalchemy(https://docs.sqlalchemy.org/en/20/faq/installation.html) [tool.setuptools.packages.find] where = ["."] @@ -81,6 +85,11 @@ pydocstyle.convention = "google" "zeus/optimizer/perseus/common.py" = ["N805"] "zeus/optimizer/perseus/server/router.py" = ["B008"] "zeus/util/pydantic_v1.py" = ["F403"] +"zeus/optimizer/batch_size/**/commands.py" = ["N805"] +"zeus/optimizer/batch_size/**/models.py" = ["N805"] +"zeus/optimizer/batch_size/server/config.py" = ["N805"] +"zeus/optimizer/batch_size/server/router.py" = ["B008"] +"zeus/optimizer/batch_size/common.py" = ["N805"] "zeus/device/gpu.py" = ["N802", "N803"] [tool.pytest.ini_options] diff --git a/tests/optimizer/batch_size/conftest.py b/tests/optimizer/batch_size/conftest.py new file mode 100644 index 00000000..9ca93bd7 --- /dev/null +++ 
b/tests/optimizer/batch_size/conftest.py @@ -0,0 +1,92 @@ +import asyncio +import os +from typing import AsyncIterator + +import pytest + +os.environ["ZEUS_BSO_DATABASE_URL"] = ( + "sqlite+aiosqlite:///dummy.db" # To prevent pydantic setting error without .env file +) +from fastapi.testclient import TestClient +from sqlalchemy.ext.asyncio.session import AsyncSession +from zeus.optimizer.batch_size.server.database.db_connection import ( + DatabaseSessionManager, + get_db_session, +) +from zeus.optimizer.batch_size.server.database.schema import Base +from zeus.optimizer.batch_size.server.router import app + + +def pytest_configure(): + # Test wide global helper functions. + + def get_fake_job(job_id: str) -> dict: + return { + "job_id": job_id, + "job_id_prefix": "test", + "seed": 1, + "default_batch_size": 1024, + "batch_sizes": [32, 64, 256, 512, 1024, 4096, 2048], + "eta_knob": 0.5, + "beta_knob": 2, + "target_metric": 0.5, + "higher_is_better_metric": True, + "max_epochs": 100, + "num_pruning_rounds": 2, + "window_size": 5, + "mab_prior_mean": 0, + "mab_prior_precision": 0, + "mab_seed": 123456, + "mab_num_explorations": 2, + } + + def get_fake_job_config(job_id: str) -> dict: + fake_job_config = get_fake_job(job_id) + fake_job_config["max_power"] = 3000 + fake_job_config["number_of_gpus"] = 4 + fake_job_config["gpu_model"] = "A100" + return fake_job_config + + pytest.get_fake_job = get_fake_job + pytest.get_fake_job_config = get_fake_job_config + + +def init(db_url: str): + sessionmanager = DatabaseSessionManager( + f"sqlite+aiosqlite:///{db_url}", {"echo": False} + ) + + async def override_db_session() -> AsyncIterator[AsyncSession]: + async with sessionmanager.session() as session: + yield session + + app.dependency_overrides[get_db_session] = override_db_session + return sessionmanager + + +async def create(sessionmanager: DatabaseSessionManager): + print("Create tables") + async with sessionmanager._engine.begin() as conn: + await 
conn.run_sync(Base.metadata.create_all) + + +async def clean(sessionmanager: DatabaseSessionManager): + print("Clean") + async with sessionmanager._engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + + +@pytest.fixture +def client(): + with TestClient(app) as c: + yield c + + +# For each worker, set up db +@pytest.fixture(scope="module", autouse=True) +def session_data(tmp_path_factory, worker_id): + root_tmp_dir = tmp_path_factory.getbasetemp().parent + sm = init(str(root_tmp_dir / f"test-{worker_id}.db")) + asyncio.run(clean(sm)) + asyncio.run(create(sm)) + return diff --git a/tests/optimizer/batch_size/simulate_with_server.py b/tests/optimizer/batch_size/simulate_with_server.py new file mode 100644 index 00000000..0963734d --- /dev/null +++ b/tests/optimizer/batch_size/simulate_with_server.py @@ -0,0 +1,470 @@ +# Copyright (C) 2023 Jae-Won Chung +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""A simulator for running trace-driven Zeus experiments.""" + +from __future__ import annotations + +from copy import deepcopy +from typing import Literal + +import httpx +import numpy as np +import pandas as pd +from zeus._legacy.policy import PowerLimitOptimizer +from zeus.analyze import HistoryEntry +from zeus.job import Job +from zeus.optimizer.batch_size.common import ( + GET_NEXT_BATCH_SIZE_URL, + REGISTER_JOB_URL, + REPORT_RESULT_URL, + JobSpecFromClient, + TrainingResult, +) +from zeus.util import zeus_cost + + +class BatchSizeOptimizerDummyClient: + def __init__(self, url=""): + self.url = url + self.trial_number = 0 + + def register_job(self, job: JobSpecFromClient): + res = httpx.post(self.url + REGISTER_JOB_URL, content=job.json()) + assert res.status_code == 200 or res.status_code == 201, res.text + + def predict(self, job_id: str): + res = httpx.get(self.url + GET_NEXT_BATCH_SIZE_URL, params={"job_id": job_id}) + bs = res.json()["batch_size"] + self.trial_number = res.json()["trial_number"] + return bs + + def observe( + self, + job: JobSpecFromClient, + batch_size: int, + total_energy: float, + time: float, + max_power: int, + converged: bool, + epoch: int, + ): + training_result = TrainingResult( + job_id=job.job_id, + batch_size=batch_size, + trial_number=self.trial_number, + error=False, + time=time, + energy=total_energy, + max_power=max_power, + metric=job.target_metric + 1 if converged else job.target_metric - 1, + current_epoch=epoch, + ) + + # report to the server about the result of this training + res = httpx.post(self.url + REPORT_RESULT_URL, content=training_result.json()) + + +# ruff: noqa: PLR0912, PLR0915 +class SimulatorWithServer: + """Simulates job execution optimized by Zeus.""" + + def __init__( + self, + summary_train: str | pd.DataFrame, + summary_power: str | pd.DataFrame, + power_limit_optimizer: PowerLimitOptimizer, + gpu: Literal["a40", "v100", "p100", "rtx6000"], + seed: int = 123456, + verbose: bool = True, + ) -> 
None: + """Initialize the simulator. + + Args: + summary_train: Path to or `pd.DataFrame` of the train trace. + summary_power: Path to or `pd.DataFrame` of the power trace. + batch_size_optimizer: The user is expected to construct + the BSO with the desired policy and pass it into the simulator. + power_limit_optimizer: The user is expected to construct + the PLO with the desired policy and pass it into the simulator. + seed: The random seed. Every invocation of any simulation method in this + class is deterministic given the random seed, because the internal RNG is + deepcopied before running the simulation. + verbose: Whether to log out the internal states of the simulator. + """ + # Generate relevant data. + train_df = ( + pd.read_csv(summary_train) + if isinstance(summary_train, str) + else summary_train + ) + power_df = ( + pd.read_csv(summary_power) + if isinstance(summary_power, str) + else summary_power + ) + df = train_df.merge(power_df, how="inner") # type: ignore + df["TTA"] = df.target_epoch * df.time_per_epoch + df["ETA"] = df.TTA * df.average_power + # 'energy_per_epoch' is used to compare different power limits with the same batch size + # when trying to figure out which power limit is the best one. + df["energy_per_epoch"] = df.time_per_epoch * df.average_power + self.df = df + + # Knob optimizers. + # self.bso = batch_size_optimizer + self.plo = power_limit_optimizer + self.bso = BatchSizeOptimizerDummyClient() + + # Save arguments. + self.seed = seed + self.verbose = verbose + self.gpu = gpu + + def simulate_one_job( + self, + job: Job, # Use this to create a job batch_sizes = self._profile_batch_size_range(job) + num_recurrence: int, + beta_knob: float, + eta_knob: float, + ) -> list[HistoryEntry]: + r"""Simulate a sequentially recurring job. Explore with early stopping. + + Args: + job: Job spec to simulate. + num_recurrence: How many times the job recurs. + beta_knob: `beta_knob * min_eta` is the early stopping cost threshold. 
+ Set to `np.inf` to disable early stopping. + eta_knob: $\eta$ used in the hybrid cost metric. + $\textrm{cost} = \eta \cdot \textrm{ETA} + (1 - \eta) \cdot \textrm{MaxPower} \cdot \textrm{TTA}$ + + Returns: + A list of [`HistoryEntry`][zeus.analyze.HistoryEntry] objects for each job run. + """ + # Figure out MAXPOWER. + max_power = self.df.power_limit.max().item() + if self.verbose: + print(f"[Simulator] Max power = {max_power}W") + + # Copy all internal state so that simulation does not modify any + # internal state and is deterministic w.r.t. the random seed. + # A new job. Profile the feasible batch size range. + jobSpec = JobSpecFromClient( + job_id="simulation-one-job", + job_id_prefix="simulation", + batch_sizes=self._profile_batch_size_range(job), + default_batch_size=job.default_bs, + target_metric=job.target_metric, + max_epochs=job.max_epochs, + beta_knob=beta_knob, + eta_knob=eta_knob, + mab_seed=self.seed, + max_power=max_power, + gpu_model=self.gpu, + number_of_gpus=1, + window_size=0, + ) + + # register job in the server + self.bso.register_job(jobSpec) + + ## Should be mocked + plo = deepcopy(self.plo) + rng = np.random.default_rng(self.seed) + + # Track the minimum cost observed for the early stopping energy threshold. + min_cost = np.inf + + # Simulate each job one at a time. + history: list[HistoryEntry] = [] + + if self.verbose: + print(f"[Simulator] {job} recurring {num_recurrence} times.") + + # Job recurs. + for i in range(num_recurrence): + if self.verbose: + print(f"\nRecurrence: {i+1}") + + # Run the job until convergence. Upper bound the number of retries to 20. + # Accumulate the cost of retries before convergence. + cost_acc = 0.0 + for tries in range(1, 21): + # Whether this run of the job needed to profile power. + profiled_power = False + + # Fetch knobs to use. + bs = self.bso.predict(jobSpec.job_id) + pl = plo.predict(job, bs) + + # When the batch size is first explored, we need to profile power limit. 
+ if pl is None: + profiled_power = True + result = self._profile_power_limit(job, bs, eta_knob) + for pl, epe in result.items(): + plo.observe(job, bs, pl, epe) + pl = plo.predict(job, bs) + assert pl + + # Run the job, potentially early stopping it. + eta, tta, reached, epoch = self._run_job( + job=job, + batch_size=bs, + power_limit=pl, + rng=rng, + cost_ub=beta_knob * min_cost, + eta_knob=eta_knob, + profile_power=profiled_power, + ) + + # The job never ran because even one epoch exceeds the cost threshold. + # Let the BSO observe that this batch size is bad, but since the job + # did not run, do not add to the history and retry. + if eta == 0 and tta == 0 and not reached: + # bso.observe(job, bs, 100 * beta_knob * min_cost, False) + self.bso.observe(jobSpec, bs, eta, tta, max_power, False, epoch) + continue + + # Compute the hybrid cost metric. + cost = zeus_cost(eta, tta, eta_knob, max_power) + cost_acc += cost + + # Provide feedback to the BSO. + # bso.observe(job, bs, cost, reached) + self.bso.observe(jobSpec, bs, eta, tta, max_power, reached, epoch) + + # Record history for analysis. + history.append(HistoryEntry(bs, pl, eta, reached, tta)) + + # Reached the target metric. Update min_cost and go on to the next recurrence. + if reached: + if self.verbose: + print() + print( + f"[Simulator] Reached target metric in {tries} {'try' if tries == 1 else 'tries'}." + ) + if min_cost > cost_acc: + if self.verbose: + print( + f"[Simulator] Minimum cost updated from {min_cost:.2f} to {cost_acc:.2f}." + ) + min_cost = cost_acc + break + # Didn't reach the target metric. + # We assume that the default BS (set by the user) will always converge. + # That is, reaching the target metric with the model should be a feasible task. + if i == 0: + raise RuntimeError( + f"The default batch size {job.default_bs} did not converge." + ) + + # Target metric was not reached in 20 tries. We consider this target metric to be unreachable. 
+ else: + raise RuntimeError("Job did not reach the target metric in 20 tries.") + + if self.verbose: + print() + print( + f"[Simulator] {job} (BS, PL, ETA, whether_reached, TTA) history: \n{history}" + ) + + return history + + def _run_job( + self, + job: Job, + batch_size: int, + power_limit: int, + rng: np.random.Generator, + cost_ub: float, + eta_knob: float, + profile_power: bool, + ) -> tuple[float, float, bool, int]: + r"""Simulate running the job and return the energy consumed and whether it converged. + + This method will randomly choose one of the possible training "paths". Then, + based on cost_ub, it will compute the maximum number of epochs the job can run. + If the path's target_epoch is smaller than or equal to the maximum number of + epochs, the cost incurred until target_epoch is returned. Otherwise, the cost + incurred until the maximum number of epochs is returned. + + It is important to note that the job may never run when the first epoch's cost + is already expected to exceed the cost upper bound. In such a case, the returned + time and energy consumptions will be zero. This case must be treated separately + in the calling code. + + If profile_power is True, the first epoch will JIT-profile power limits coarsely + by dividing the epoch evenly into len(available_power_limits) pieces. Thus the + the first epoch's energy and time consumption will be slightly adjusted. + + Args: + job: Job spec to run. + batch_size: Batch size to use. + power_limit: Power limit to use. Regardless of whether this run of this + batch size requires power profiling, the simulator will input the optimal + power limit for the batch size. The first epoch energy consumption from + power profiling is adjusted in the latter half of this method based on the + profile_power flag. + rng: Random number generator used to sample one training path. + cost_ub: Cost upper bound. The job is terminated when the next epoch is going + to exceed the cost upper bound. 
+ eta_knob: $\eta$ used in the hybrid cost metric. + $\textrm{cost} = \eta \cdot \textrm{ETA} + (1 - \eta) \cdot \textrm{MaxPower} \cdot \textrm{TTA}$ + profile_power: Whether this run of the job should profile power during the + first epoch. + + Returns: + Tuple of energy consumption, time consumption, whether the job reached the target metric, and max_epochs indicating how many epochs we ran. + """ + # df is filtered with job spec, BS, and PL. We sample one possible training path. + # power_df is filtered with job spec and BS. We use this to compute the energy + # consumption of profiling power during the first epoch. + df = job.filter_df(self.df) + power_df = df.loc[df.batch_size == batch_size] + df = power_df.loc[df.power_limit == power_limit] + path = df.sample(n=1, random_state=rng) + + # Max number of epochs is bound by either the cost upper bound or the user-specified + # max_epochs, whichever is smaller. + if cost_ub == np.inf: + # cost_ub is infinity in two cases: + # 1. The simulator has never observed any cost value in the early part of simulation. + # 2. We're simulating with no early stopping, i.e. beta_knob is infinity. + max_epochs = job.max_epochs + if self.verbose: + print(f"[run job] Cost UB is inf. {max_epochs=}") + else: + # Stop right before the first epoch when cost will cross the upper bound. + cost_per_epoch = ( + eta_knob * path.energy_per_epoch.item() + + (1 - eta_knob) + * power_df.power_limit.max().item() + * path.time_per_epoch.item() + ) + max_epochs = min(cost_ub // cost_per_epoch, job.max_epochs) + if self.verbose: + print(f"[run job] {cost_ub=}") + print(f"[run job] {cost_per_epoch=}") + print(f"[run job] {max_epochs=}") + + def compute_energy_and_time( + num_epochs: int, profile_power: bool + ) -> tuple[float, float]: + """Compute the energy and time consumed for running the job for num_epochs.""" + # This is the first run of this batch size, and we need to profile power + # during the first epoch. 
+ if profile_power: + # Note that power_df holds rows with all power limits. Evenly splitting the + # epochs with the number of samples and running each slice with each power + # limit consumes (1/N) * e_100 + (1/N) * e_125 + ... + (1/N) * e_250. + # Also there are all runs 1, 2, ... included, but power info is actually + # completely duplicated across different runs in the DataFrame. + # Thus, taking the mean across the entire power_df gets us what we want. + energy_first_epoch = power_df.energy_per_epoch.mean().item() + energy_from_second_epoch = path.energy_per_epoch.item() * ( + num_epochs - 1 + ) + energy_consumption = energy_first_epoch + energy_from_second_epoch + time_first_epoch = power_df.time_per_epoch.mean().item() + time_from_second_epoch = path.time_per_epoch.item() * (num_epochs - 1) + time_consumption = time_first_epoch + time_from_second_epoch + # Just run num_epochs with the given power limit. Simple. + else: + energy_consumption = path.energy_per_epoch.item() * num_epochs + time_consumption = path.time_per_epoch.item() * num_epochs + return energy_consumption, time_consumption + + # The job virtually never ran. Time and Energy being zero will be treated specially outside. + # If the min_cost is so low, this might even prevent this BS from running at all. + if max_epochs == 0: + eta, tta = compute_energy_and_time(max_epochs + 1, profile_power) + print( + f"[run job] {job} cannot run even one epoch without exceeding the cost UB." + f" BS {batch_size}, PL {power_limit}, {eta_knob=}" + ) + return eta, tta, False, max_epochs + 1 + + # Job reached target metric. 
+ target_epoch = path.target_epoch.item() + if path.target_epoch.notnull().item() and target_epoch <= max_epochs: + eta, tta = compute_energy_and_time(target_epoch, profile_power) + if self.verbose: + print( + f"[run job] {job} @ {batch_size},{power_limit}W{' prof' if profile_power else ''} " + f"=> \033[31mReached in {int(target_epoch)} epochs, " + f"TTA {tta:.2f} seconds, ETA {eta:.2f}\033[0m" + ) + return eta, tta, True, max_epochs + + # Job failed to reach the target metric. + energy_consumption, time_consumption = compute_energy_and_time( + max_epochs, profile_power + ) + if self.verbose: + print( + f"[run job] {job} @ {batch_size},{power_limit}W{' prof' if profile_power else ''} " + f"=> \033[31mFailed (stopped after {int(max_epochs)} epochs), " + f"TTA {time_consumption:.2f} seconds, ETA {energy_consumption:.2f}\033[0m" + ) + return ( + energy_consumption, + time_consumption, + False, + job.max_epochs, + ) # reached max epoch or the next epoch will reach cost ub. -> server will give another chance if epoch is less than max epoch + + def _profile_power_limit( + self, job: Job, batch_size: int, eta_knob: float + ) -> dict[int, float]: + """Simulate running the job and profiling the power limit. + + Returns: + Dictionary mapping PL to `energy_per_epoch`. PL is inserted in high to low order. + """ + # Filter by job spec and BS. + df = job.filter_df(self.df) + df = df.loc[(df.batch_size == batch_size)] + + # Compute the epoch cost of each power limit (Equation 7). + max_pl = df.power_limit.max().item() + df = df.groupby(["power_limit"], as_index=False).mean(numeric_only=True) + df["epoch_cost"] = ( + eta_knob * df["average_power"] + (1 - eta_knob) * max_pl + ) * df["time_per_epoch"] + + # We'll be profiling energy from larger to smaller power limits. 
+ df = df.sort_values(by="power_limit", ascending=False) + result = {rec.power_limit: rec.epoch_cost for rec in df.to_records(index=False)} + if self.verbose: + print(f"[PL profile] {job} @ {batch_size} => PL = {min(result, key=result.get)}W") # type: ignore + return result + + def _profile_batch_size_range(self, job: Job) -> list[int]: + """Simulate profiling the available batch size range of the job. + + Returns: + A list of feasible batch sizes. + """ + df = self.df + # Do not filter by target_metric here since we do not want to constrain + # the feasible batch size range to only those that reached the target metric. + df = df.loc[ + (df.dataset == job.dataset) + & (df.network == job.network) + & (df.optimizer == job.optimizer) + ] + result = sorted(list(df.batch_size.unique())) + if self.verbose: + print(f"[BS profile] {job} => BS = {result}") + return result diff --git a/tests/optimizer/batch_size/test_client.py b/tests/optimizer/batch_size/test_client.py new file mode 100644 index 00000000..19807de7 --- /dev/null +++ b/tests/optimizer/batch_size/test_client.py @@ -0,0 +1,154 @@ +from copy import deepcopy +from unittest.mock import MagicMock + +import pytest +from pytest_mock import MockerFixture +from zeus.monitor.energy import Measurement, ZeusMonitor +from zeus.optimizer.batch_size.client import BatchSizeOptimizer +from zeus.optimizer.batch_size.common import JobSpec +from zeus.optimizer.batch_size.exceptions import ZeusBSOBadOperationError + + +@pytest.fixture +def mock_monitor(mocker: MockerFixture): + mocker.patch("pynvml.nvmlInit") + + zeus_monitor_mock_instance = MagicMock(spec=ZeusMonitor) + zeus_monitor_mock_instance.nvml_gpu_indices = [0, 1, 2, 3] + zeus_monitor_mock_instance.gpu_indices = [0, 1, 2, 3] + zeus_monitor_mock_instance.end_window.return_value = Measurement( + time=37.24807469360, + energy={ + 0: 4264.87199999392, + 1: 4367.186999991536, + 2: 4342.869000002742, + 3: 4158.034000009298, + }, + ) + + mocker.patch( + 
"zeus.monitor.energy.ZeusMonitor", return_value=zeus_monitor_mock_instance + ) + mocker.patch("pynvml.nvmlDeviceGetName").return_value = "Tesla V100" + mocker.patch("pynvml.nvmlDeviceGetHandleByIndex").return_value = 0 + mocker.patch("pynvml.nvmlDeviceGetPowerManagementLimitConstraints").return_value = [ + 100000, + 300000, + ] + return zeus_monitor_mock_instance + + +@pytest.fixture(autouse=True) +def mock_http_call(client, mocker: MockerFixture): + mocker.patch("httpx.post", side_effect=client.post) + mocker.patch("httpx.get", side_effect=client.get) + mocker.patch("httpx.patch", side_effect=client.patch) + + mocker.patch("atexit.register") + + +def test_register_job(mock_monitor): + job = JobSpec.parse_obj(pytest.get_fake_job("test_register_job")) + bso_client = BatchSizeOptimizer(mock_monitor, "", job) + assert bso_client.job.max_power == 300 * len(mock_monitor.gpu_indices) + bso_client = BatchSizeOptimizer(mock_monitor, "", job) + assert bso_client.job.max_power == 300 * len(mock_monitor.gpu_indices) + + +def test_batch_sizes(mock_monitor): + job = JobSpec.parse_obj(pytest.get_fake_job("test_batch_sizes")) + bso_client = BatchSizeOptimizer(mock_monitor, "", job) + bs = bso_client.get_batch_size() + + assert bs == 1024 and bso_client.current_batch_size == 1024 + + bso_client.on_train_begin() + bso_client.on_evaluate(0.1) + bso_client.on_evaluate(0.2) + bso_client.on_evaluate(0.6) # Converged + + bso_client = BatchSizeOptimizer(mock_monitor, "", job) + bs = bso_client.get_batch_size() + bso_client.on_train_begin() + + assert bs == 512 and bso_client.current_batch_size == 512 + + i = 0 + with pytest.raises(Exception) as e_info: + while i < job.max_epochs - 10: # Test Early stop + bso_client.on_evaluate(0.3) + i += 1 + assert i == bso_client.cur_epoch + assert bso_client.current_batch_size == 512 + + assert str(e_info.value).find("cost upper bound") != -1 + + bso_client = BatchSizeOptimizer(mock_monitor, "", job) + bs = bso_client.get_batch_size() + 
bso_client.on_train_begin() + + assert bs == 2048 and bso_client.current_batch_size == 2048 + + +def test_converge_fail(mock_monitor): + job = JobSpec.parse_obj(pytest.get_fake_job("test_converge_fail")) + job.beta_knob = None # disable early stop + bso_client = BatchSizeOptimizer(mock_monitor, "", job) + bso_client.on_train_begin() + bs = bso_client.get_batch_size() + + assert bs == 1024 and bso_client.current_batch_size == 1024 + + i = 0 + with pytest.raises(Exception) as e_info: + while i < job.max_epochs + 10: # Fail after max_epoch + bso_client.on_evaluate(0.3) + i += 1 + assert i == bso_client.cur_epoch + assert bso_client.current_batch_size == 1024 + + print(e_info.value, i) + assert str(e_info.value).find("Train failed to converge within max_epoch") != -1 + + bso_client = BatchSizeOptimizer(mock_monitor, "", job) + bs = bso_client.get_batch_size() + bso_client.on_train_begin() + + assert bs == 2048 and bso_client.current_batch_size == 2048 + + +def test_distributed_setting(mock_monitor): + job = JobSpec.parse_obj(pytest.get_fake_job("test_distributed_setting")) + NGPU = 4 + bso_clients = [ + BatchSizeOptimizer(mock_monitor, "", job, rank=i) for i in range(NGPU) + ] + + # Only rank=0 can ask for bs + for i in range(1, NGPU): + with pytest.raises(ZeusBSOBadOperationError) as e_info: + bso_clients[i].get_batch_size() + + bs = bso_clients[0].get_batch_size() + # distribute batch size to other clients + for i in range(1, NGPU): + bso_clients[i].current_batch_size = bs + bso_clients[i].trial_number = bso_clients[0].trial_number + + # Mark as unconverged from rank = 0 client + i = 0 + with pytest.raises(Exception) as e_info: + while i < job.max_epochs + 10: # Fail after max_epoch + bso_clients[0].on_evaluate(0.3) + i += 1 + assert i == bso_clients[0].cur_epoch + assert bso_clients[0].current_batch_size == 1024 + + print("[ERROR]", e_info.value, i) + assert str(e_info.value).find("Train failed to converge within max_epoch") != -1 + + for i in range(1, NGPU): + with 
pytest.raises(Exception) as e_info: + bso_clients[i].on_evaluate(0.3) + assert bso_clients[i].current_batch_size == 1024 + assert str(e_info.value).find("is already reported.") != -1 diff --git a/tests/optimizer/batch_size/test_explorer.py b/tests/optimizer/batch_size/test_explorer.py new file mode 100644 index 00000000..a713ef34 --- /dev/null +++ b/tests/optimizer/batch_size/test_explorer.py @@ -0,0 +1,208 @@ +import logging +import re +import uuid +from math import isclose +from typing import Tuple + +import pytest +from zeus.optimizer.batch_size.common import ( + GET_NEXT_BATCH_SIZE_URL, + REGISTER_JOB_URL, + REPORT_RESULT_URL, + TrainingResult, + TrialId, +) +from zeus.util.metric import zeus_cost + + +@pytest.fixture(scope="session", autouse=True) +def logger_setup(): + logger = logging.getLogger( + "zeus.optimizer.batch_size.server.mab" + ) # for testing, propagate the log to the root logger so that caplog can capture + logger.propagate = True + yield + + +@pytest.mark.usefixtures("client") +@pytest.mark.usefixtures("caplog") +class TestPruningExploreManager: + """Unit test class for pruning exploration.""" + + batch_sizes: list[int] = [8, 16, 32, 64, 128, 256] + + def exploration_to_training_result( + self, + exploration: tuple[int, float, bool], + job_id: str, + trial_number: int, + max_power: int, + ) -> TrainingResult: + energy = 1 + res = TrainingResult( + job_id=job_id, + batch_size=exploration[0], + trial_number=trial_number, + error=False, + time=(2 * exploration[1] - energy) / max_power, + energy=energy, + metric=0.55 if exploration[2] else 0.4, + current_epoch=100, + ) + assert isclose( + zeus_cost(res.energy, res.time, 0.5, max_power), + exploration[1], + ) + return res + + # 0.5 * energy + (1 - eta_knob) * max_power * time + def register_job_with_default_bs(self, client, default_bs: int) -> Tuple[str, int]: + job_id = f"test-{str(uuid.uuid4())}" + fake_job = pytest.get_fake_job_config(job_id) + fake_job["beta_knob"] = None + fake_job["job_id"] = 
job_id + fake_job["batch_sizes"] = self.batch_sizes + fake_job["default_batch_size"] = default_bs + + response = client.post(REGISTER_JOB_URL, json=fake_job) + assert response.status_code == 201 + + return job_id, fake_job["max_power"] + + def run_exploration( + self, + client, + caplog, + job_id: str, + exploration: list[tuple[int, float, bool]], + result: list[int], + max_power: int, + ) -> None: + """Drive the pruning explore manager and check results.""" + caplog.set_level(logging.INFO) + + for exp in exploration: + response = client.get( + GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_id}, + ) + assert response.status_code == 200 + parsed_res = TrialId.parse_obj(response.json()) + assert ( + parsed_res.batch_size == exp[0] + ), f"Expected {exp[0]} but got {parsed_res.batch_size} ({exp})" + + training_result = self.exploration_to_training_result( + exp, job_id, parsed_res.trial_number, max_power + ) + response = client.post( + REPORT_RESULT_URL, + content=training_result.json(), + ) + assert response.status_code == 200, response.text + assert response.json()["converged"] == exp[2] + print(response.json()["message"]) + # Now good_bs should be equal to result! 
+ + # this will construct mab + response = client.get( + GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_id}, + ) + assert response.status_code == 200 + + # Capture list of arms from stdout + matches = re.search(r"with arms \[(.*?)\]", caplog.text) + + if matches: + arms = [int(x) for x in matches.group(1).split(",")] + arms.sort() + assert arms == result + else: + assert False, "No output found from constructing Mab" + + def test_normal(self, client, caplog): + """Test a typical case.""" + job_id, max_power = self.register_job_with_default_bs(client, 128) + + exploration = [ + (128, 10.0, True), + (64, 9.0, True), + (32, 8.0, True), + (16, 12.0, True), + (8, 21.0, False), + (256, 15.0, True), + (32, 8.0, True), + (16, 12.0, False), + (64, 9.0, True), + (128, 10.0, True), + (256, 17.0, False), + ] + + result = [32, 64, 128] + self.run_exploration(client, caplog, job_id, exploration, result, max_power) + + def test_default_is_largest(self, client, caplog): + """Test the case when the default batch size is the largest one.""" + job_id, max_power = self.register_job_with_default_bs(client, 256) + + exploration = [ + (256, 7.0, True), + (128, 8.0, True), + (64, 9.0, True), + (32, 13.0, True), + (16, 22.0, False), + (256, 8.0, True), + (128, 8.5, True), + (64, 9.0, True), + (32, 12.0, True), + ] + result = [32, 64, 128, 256] + self.run_exploration(client, caplog, job_id, exploration, result, max_power) + + def test_default_is_smallest(self, client, caplog): + """Test the case when the default batch size is the smallest one.""" + job_id, max_power = self.register_job_with_default_bs(client, 8) + + exploration = [ + (8, 10.0, True), + (16, 17.0, True), + (32, 20.0, True), + (64, 25.0, False), + (8, 10.0, True), + (16, 21.0, False), + ] + result = [8] + self.run_exploration(client, caplog, job_id, exploration, result, max_power) + + def test_all_converge(self, client, caplog): + """Test the case when every batch size converges.""" + job_id, max_power = 
self.register_job_with_default_bs(client, 64) + exploration = [ + (64, 10.0, True), + (32, 8.0, True), + (16, 12.0, True), + (8, 15.0, True), + (128, 12.0, True), + (256, 13.0, True), + (32, 7.0, True), + (16, 10.0, True), + (8, 15.0, True), + (64, 10.0, True), + (128, 12.0, True), + (256, 13.0, True), + ] + result = self.batch_sizes + self.run_exploration(client, caplog, job_id, exploration, result, max_power) + + def test_every_bs_is_bs(self, client, caplog): + """Test the case when every batch size other than the default fail to converge.""" + job_id, max_power = self.register_job_with_default_bs(client, 64) + exploration = [ + (64, 10.0, True), + (32, 22.0, False), + (128, 25.0, False), + (64, 9.0, True), + ] + result = [64] + self.run_exploration(client, caplog, job_id, exploration, result, max_power) diff --git a/tests/optimizer/batch_size/test_server.py b/tests/optimizer/batch_size/test_server.py new file mode 100644 index 00000000..2f86c8a5 --- /dev/null +++ b/tests/optimizer/batch_size/test_server.py @@ -0,0 +1,357 @@ +import random + +import pytest +from zeus.optimizer.batch_size.common import ( + DELETE_JOB_URL, + GET_NEXT_BATCH_SIZE_URL, + REGISTER_JOB_URL, + REPORT_END_URL, + REPORT_RESULT_URL, +) + +# https://fastapi.tiangolo.com/tutorial/testing/ + + +def test_register_job(client): + job_config = pytest.get_fake_job_config("test_register_job") + response = client.post(REGISTER_JOB_URL, json=job_config) + print(response.text) + print(str(response)) + assert response.status_code == 201 + + response = client.post(REGISTER_JOB_URL, json=job_config) + print(response.text) + assert response.status_code == 200 + + job_config["default_batch_size"] = 512 + response = client.post(REGISTER_JOB_URL, json=job_config) + print(response.text) + assert response.status_code == 409 + + +def test_register_job_validation_error(client): + temp = pytest.get_fake_job_config("test_register_job_validation_error") + temp["default_batch_size"] = 128 + response = 
client.post(REGISTER_JOB_URL, json=temp) + assert response.status_code == 422 + + temp["default_batch_size"] = 0 + response = client.post(REGISTER_JOB_URL, json=temp) + assert response.status_code == 422 + + temp = pytest.get_fake_job_config("test_register_job_validation_error") + temp["max_epochs"] = 0 + response = client.post(REGISTER_JOB_URL, json=temp) + assert response.status_code == 422 + + temp = pytest.get_fake_job_config("test_register_job_validation_error") + temp["batch_sizes"] = [] + response = client.post(REGISTER_JOB_URL, json=temp) + assert response.status_code == 422 + + temp = pytest.get_fake_job_config("test_register_job_validation_error") + temp["eta_knob"] = 1.1 + response = client.post(REGISTER_JOB_URL, json=temp) + assert response.status_code == 422 + + temp = pytest.get_fake_job_config("test_register_job_validation_error") + temp["beta_knob"] = 0 + response = client.post(REGISTER_JOB_URL, json=temp) + assert response.status_code == 422 + + +def test_predict(client): + job_config = pytest.get_fake_job_config("test_predict") + response = client.post(REGISTER_JOB_URL, json=job_config) + assert response.status_code == 201 + + cur_default_bs = job_config["default_batch_size"] + response = client.get( + GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_config["job_id"]}, + ) + print(response.text) + assert response.status_code == 200 + assert response.json()["batch_size"] == cur_default_bs + assert response.json()["trial_number"] == 1 + + # concurrent job submission + response = client.get( + GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_config["job_id"]}, + ) + print(response.text) + assert response.status_code == 200 + assert response.json()["batch_size"] == cur_default_bs + assert response.json()["trial_number"] == 2 + + +def test_report(client): + job_config = pytest.get_fake_job_config("test_report") + response = client.post(REGISTER_JOB_URL, json=job_config) + assert response.status_code == 201 + + response = client.get( + 
GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_config["job_id"]}, + ) + # Converged within max epoch => successful training + response = client.post( + REPORT_RESULT_URL, + json={ + "job_id": job_config["job_id"], + "batch_size": response.json()["batch_size"], + "trial_number": response.json()["trial_number"], + "time": 14.438, + "energy": 3000.123, + "metric": 0.55, + "current_epoch": 98, + }, + ) + assert ( + response.status_code == 200 + and response.json()["converged"] == True + and response.json()["stop_train"] == True + ) + + response = client.get( + GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_config["job_id"]}, + ) + # Early stop + response = client.post( + REPORT_RESULT_URL, + json={ + "job_id": job_config["job_id"], + "batch_size": response.json()["batch_size"], + "trial_number": response.json()["trial_number"], + "time": 30, + "energy": 6000, + "metric": 0.55, + "current_epoch": 98, + }, + ) + assert ( + response.status_code == 200 + and response.json()["converged"] == False + and response.json()["stop_train"] == True + ), response.text + + +def test_exploration_stage(client): + job_config = pytest.get_fake_job_config("test_exploration_stage") + response = client.post(REGISTER_JOB_URL, json=job_config) + assert response.status_code == 201 + + cur_default_bs = job_config["default_batch_size"] + bss = job_config["batch_sizes"] + trial_number = 1 + for trial in range(1, job_config["num_pruning_rounds"] + 1): + idx = bss.index(cur_default_bs) + down = sorted(bss[: idx + 1], reverse=True) + up = sorted(bss[idx + 1 :]) + new_bss = [] + + print("Exploration space:", [down, up]) + for bs_list in [down, up]: + for bs in bs_list: + # Predict + response = client.get( + GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_config["job_id"]}, + ) + assert response.status_code == 200 + assert response.json()["batch_size"] == bs + assert response.json()["trial_number"] == trial_number + trial_number += 1 + + # Concurrent job + response = client.get( + 
GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_config["job_id"]}, + ) + assert response.status_code == 200 + assert ( + response.json()["batch_size"] == cur_default_bs + if trial == 1 and bs == 512 + else 512 + ) + assert response.json()["trial_number"] == trial_number + trial_number += 1 + + time = 14.438 + converged = random.choice([True, True, False]) + if bs == 1024: # make default bs converged for 512 + converged = True + if ( + bs == 512 + ): # make 512 as the best bs so that we can change the default bs to 512 next round + converged = True + time = 12 + if converged: + new_bss.append(bs) + + response = client.post( + REPORT_RESULT_URL, + json={ + "job_id": job_config["job_id"], + "batch_size": bs, + "error": False, + # Report to first trial in the loop + "trial_number": trial_number - 2, + "time": time, + "energy": 3000.123, + "metric": 0.55 if converged else 0.33, + "current_epoch": 98 if converged else 100, + }, + ) + assert ( + response.status_code == 200 + and response.json()["converged"] == converged + and response.json()["stop_train"] == True + ) + if not converged: + break + bss = sorted(new_bss) + cur_default_bs = 512 + + +def test_mab_stage(client): + job_config = pytest.get_fake_job_config("test_mab_stage") + response = client.post(REGISTER_JOB_URL, json=job_config) + assert response.status_code == 201 + + bs_seq = [] + # Previous default batch size is converged + for _ in range(50): + # Predict + response = client.get( + GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_config["job_id"]}, + ) + assert response.status_code == 200 + bs = response.json()["batch_size"] + trial_number = response.json()["trial_number"] + bs_seq.append(response.json()["batch_size"]) + # Concurrent job + response = client.get( + GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_config["job_id"]}, + ) + assert response.status_code == 200 + bs_seq.append(response.json()["batch_size"]) + + response = client.post( + REPORT_RESULT_URL, + json={ + "job_id": 
job_config["job_id"], + "batch_size": bs, + "trial_number": trial_number, + "error": False, + "time": 15.123, + "energy": 3000.123, + "max_power": 300, + "metric": 0.55, + "current_epoch": 98, + }, + ) + assert ( + response.status_code == 200 + and response.json()["converged"] == True + and response.json()["stop_train"] == True + ) + print(bs_seq) + + +def test_end_trial(client): + job_config = pytest.get_fake_job_config("test_end_trial") + response = client.post(REGISTER_JOB_URL, json=job_config) + assert response.status_code == 201 + + # Start trial + response = client.get( + GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_config["job_id"]}, + ) + assert response.status_code == 200 + trial_number = response.json()["trial_number"] + bs = response.json()["batch_size"] + + # End Trial. + response = client.patch( + REPORT_END_URL, + json={ + "job_id": job_config["job_id"], + "batch_size": bs, + "trial_number": trial_number, + }, + ) + assert response.status_code == 200 + + # Start trial + response = client.get( + GET_NEXT_BATCH_SIZE_URL, + params={"job_id": job_config["job_id"]}, + ) + assert response.status_code == 200 + trial_number = response.json()["trial_number"] + bs = response.json()["batch_size"] + + # Report result. + response = client.post( + REPORT_RESULT_URL, + json={ + "job_id": job_config["job_id"], + "batch_size": bs, + "trial_number": trial_number, + "error": False, + "time": 15.123, + "energy": 3000.123, + "max_power": 300, + "metric": 0.55, + "current_epoch": 98, + }, + ) + + # End Trial. 
+ response = client.patch( + REPORT_END_URL, + json={ + "job_id": job_config["job_id"], + "batch_size": bs, + "trial_number": trial_number, + }, + ) + assert response.status_code == 200 + + +def test_delete_job(client): + job_config = pytest.get_fake_job_config("test_delete_job") + response = client.post(REGISTER_JOB_URL, json=job_config) + assert response.status_code == 201 + + response = client.delete( + DELETE_JOB_URL, + params={"job_id": job_config["job_id"]}, + ) + print(response.text) + assert response.status_code == 200 + + response = client.post(REGISTER_JOB_URL, json=job_config) + print(response.text) + assert response.status_code == 201 + + response = client.delete( + DELETE_JOB_URL, + params={"job_id": job_config["job_id"]}, + ) + print(response.text) + assert response.status_code == 200 + + # Job doesn't exists + response = client.delete( + DELETE_JOB_URL, + params={"job_id": "UNKNOWN"}, + ) + print(response.text) + assert response.status_code == 404 diff --git a/tests/optimizer/batch_size/test_simulator.py b/tests/optimizer/batch_size/test_simulator.py new file mode 100644 index 00000000..3163d498 --- /dev/null +++ b/tests/optimizer/batch_size/test_simulator.py @@ -0,0 +1,135 @@ +import logging +import re +from pathlib import Path +from typing import Literal + +import pandas as pd +import pytest +from pytest_mock import MockerFixture +from tests.optimizer.batch_size.simulate_with_server import SimulatorWithServer +from zeus._legacy.policy.optimizer import ( + JITPowerLimitOptimizer, + PruningGTSBatchSizeOptimizer, +) +from zeus._legacy.simulate import Simulator +from zeus.job import Job + +config = { + "gpu": "v100", + "eta_knob": 0.5, + "beta_knob": 2.0, + "seed": 1, + "dataset": "librispeech", + "model": "deepspeech2", + "optimizer": "adamw", + "target_metric": 40.0, + "max_epochs": 16, + "b_0": 192, # default_bs + "num_recurrence": None, +} + + +def arm_state_parser(output): + # Define regex patterns to match the numbers + arm_pattern = 
r"Arm\s+(\d+)" + mu_pattern = r"N\(([-+]?\d*\.\d+|\d+),\s+([-+]?\d*\.\d+|\d+)\)" + arrow_pattern = r"-> ([-+]?\d*\.\d+|\d+)" + + # Use regex to find the numbers in the output + arm_numbers = re.findall(arm_pattern, output, re.MULTILINE) + mu_numbers = re.findall(mu_pattern, output, re.MULTILINE) + arrow_numbers = re.findall(arrow_pattern, output, re.MULTILINE) + + d = [] + + for arm, mu, arrow in zip(arm_numbers, mu_numbers, arrow_numbers): + d.append({"Arm": arm, "Mean": mu[0], "stdev": mu[1], "Arrow": arrow}) + d.sort(key=lambda x: x["Arm"]) + return d + + +def read_trace( + gpu: Literal["a40", "v100", "p100", "rtx6000"] +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Read the train and power trace files as Pandas DataFrames.""" + trace_dir = Path(__file__).resolve(strict=True).parents[3] + train_df = pd.DataFrame(pd.read_csv(trace_dir / "trace/summary_train.csv")) + power_df = pd.DataFrame(pd.read_csv(trace_dir / f"trace/summary_power_{gpu}.csv")) + return train_df, power_df + + +@pytest.fixture(scope="module", autouse=True) +def logger_setup(): + logger = logging.getLogger( + "zeus.optimizer.batch_size.server.mab" + ) # for testing, propagate the log to the root logger so that caplog can capture + logger.propagate = True + + +def test_end_to_end(client, caplog, capsys, mocker: MockerFixture): + mocker.patch("httpx.post", side_effect=client.post) + mocker.patch("httpx.get", side_effect=client.get) + mocker.patch("httpx.patch", side_effect=client.patch) + + ## CONFIG + gpu: Literal["a40", "v100", "p100", "rtx6000"] = config["gpu"] + eta_knob: float = config["eta_knob"] + beta_knob: float = config["beta_knob"] + num_recurrence: int | None = config["num_recurrence"] + seed: int = config["seed"] + + job = Job( + config["dataset"], + config["model"], + config["optimizer"], + config["target_metric"], + config["max_epochs"], + config["b_0"], + ) + + train_df, power_df = read_trace(gpu) + + # # Use 2 * |B| * |P| is num_recurrence is None. 
+ # print(num_recurrence) + if num_recurrence is None: + job_df = job.filter_df(train_df.merge(power_df, how="inner")) + num_recurrence = ( + 2 * len(job_df.batch_size.unique()) * len(job_df.power_limit.unique()) + ) + + ### New simulator + # Instantiate optimizers. + plo = JITPowerLimitOptimizer(verbose=False) + + # Instantitate the simulator. + simulator = SimulatorWithServer( + train_df, power_df, plo, gpu, seed=seed, verbose=False + ) + # # Run the simulator. + result = simulator.simulate_one_job(job, num_recurrence, beta_knob, eta_knob) + selected_bs = [item.bs for item in result] + + ### Original Simulator + # Instantiate optimizers. + org_plo = JITPowerLimitOptimizer(verbose=False) + org_bso = PruningGTSBatchSizeOptimizer(seed=seed, verbose=True) + + # Instantitate the simulator. + original_simulator = Simulator( + train_df, power_df, org_bso, org_plo, seed=seed, verbose=False + ) + original_result = original_simulator.simulate_one_job( + job, num_recurrence, beta_knob, eta_knob + ) + org_selected_bs = [item.bs for item in original_result] + + out, err = capsys.readouterr() + records = arm_state_parser(out) + + new_sim_records = arm_state_parser(caplog.text) + + # Compare arm states + assert records == new_sim_records + + # Compare selected batch sizes + assert selected_bs == org_selected_bs diff --git a/tests/test_batch_size_optimizer.py b/tests/test_batch_size_optimizer.py deleted file mode 100644 index 57e23d92..00000000 --- a/tests/test_batch_size_optimizer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (C) 2023 Jae-Won Chung -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import pytest - -from zeus.policy.optimizer import PruningExploreManager - - -class TestPruningExploreManager: - """Unit test class for pruning exploration.""" - - batch_sizes: list[int] = [8, 16, 32, 64, 128, 256] - - def run_exploration( - self, - manager: PruningExploreManager, - exploration: list[tuple[int, float, bool]], - result: list[int], - ) -> None: - """Drive the pruning explore manager and check results.""" - for bs, cost, reached in exploration: - assert manager.next_batch_size() == bs - manager.report_batch_size_result(bs, cost, reached) - with pytest.raises(StopIteration) as raised: - manager.next_batch_size() - assert raised.value.value == result - - def test_normal(self): - """Test a typical case.""" - manager = PruningExploreManager(self.batch_sizes, 128) - exploration = [ - (128, 10.0, True), - (64, 9.0, True), - (32, 8.0, True), - (16, 12.0, True), - (8, 21.0, False), - (256, 15.0, True), - (32, 8.0, True), - (16, 12.0, False), - (64, 9.0, True), - (128, 10.0, True), - (256, 17.0, False), - ] - result = [32, 64, 128] - self.run_exploration(manager, exploration, result) - - def test_default_is_largest(self): - """Test the case when the default batch size is the largest one.""" - manager = PruningExploreManager(self.batch_sizes, 256) - exploration = [ - (256, 7.0, True), - (128, 8.0, True), - (64, 9.0, True), - (32, 13.0, True), - (16, 22.0, False), - (256, 8.0, True), - (128, 8.5, True), - (64, 9.0, True), - (32, 12.0, True), - ] - result = [32, 64, 128, 256] - self.run_exploration(manager, 
exploration, result) - - def test_default_is_smallest(self): - """Test the case when the default batch size is the smallest one.""" - manager = PruningExploreManager(self.batch_sizes, 8) - exploration = [ - (8, 10.0, True), - (16, 17.0, True), - (32, 20.0, True), - (64, 25.0, False), - (8, 10.0, True), - (16, 21.0, False), - ] - result = [8] - self.run_exploration(manager, exploration, result) - - def test_all_converge(self): - """Test the case when every batch size converges.""" - manager = PruningExploreManager(self.batch_sizes, 64) - exploration = [ - (64, 10.0, True), - (32, 8.0, True), - (16, 12.0, True), - (8, 15.0, True), - (128, 12.0, True), - (256, 13.0, True), - (32, 7.0, True), - (16, 10.0, True), - (8, 15.0, True), - (64, 10.0, True), - (128, 12.0, True), - (256, 13.0, True), - ] - result = self.batch_sizes - self.run_exploration(manager, exploration, result) - - def test_every_bs_is_bs(self): - """Test the case when every batch size other than the default fail to converge.""" - manager = PruningExploreManager(self.batch_sizes, 64) - exploration = [ - (64, 10.0, True), - (32, 22.0, False), - (128, 25.0, False), - (64, 9.0, True), - ] - result = [64] - self.run_exploration(manager, exploration, result) diff --git a/zeus/__init__.py b/zeus/__init__.py index 83a6044d..c7031a3e 100644 --- a/zeus/__init__.py +++ b/zeus/__init__.py @@ -21,9 +21,9 @@ - [`controller`][zeus.controller]: A collection of controllers that influence training flow. - [`analyze`][zeus.analyze]: Functions for analyzing log files. - [`job`][zeus.job]: Job specification. -- [`simulate`][zeus.simulate]: Machinery for trace-driven Zeus. +- [`simulate`][zeus._legacy.simulate]: Machinery for trace-driven Zeus. - [`device`][zeus.device]: Abstraction of compute devices. -- [`policy`][zeus.policy]: Collection of optimization policies. +- [`_legacy`][zeus._legacy.policy]: Collection of optimization policies for reproducing the paper's result. - [`util`][zeus.util]: Utility functions and classes. 
""" diff --git a/zeus/_legacy/__init__.py b/zeus/_legacy/__init__.py new file mode 100644 index 00000000..d50919c2 --- /dev/null +++ b/zeus/_legacy/__init__.py @@ -0,0 +1,5 @@ +"""Zeus legacy batch size optimizer. + +In order to reproduce the paper's result, use this legacy code. +To actually use the batch size optimizer, use the `zeus.optimizer.batch_size` +""" diff --git a/zeus/policy/__init__.py b/zeus/_legacy/policy/__init__.py similarity index 60% rename from zeus/policy/__init__.py rename to zeus/_legacy/policy/__init__.py index 318fadc3..4c996708 100644 --- a/zeus/policy/__init__.py +++ b/zeus/_legacy/policy/__init__.py @@ -14,16 +14,19 @@ """Optimization policies for Zeus. -[`PowerLimitOptimizer`][zeus.policy.interface.PowerLimitOptimizer] and -[`BatchSizeOptimizer`][zeus.policy.interface.BatchSizeOptimizer] are +[`PowerLimitOptimizer`][zeus._legacy.policy.interface.PowerLimitOptimizer] and +[`BatchSizeOptimizer`][zeus._legacy.policy.interface.BatchSizeOptimizer] are abstract classes. Users can implement custom policies by extending the abstract classes and implementing required method. -Currently, [`Simulator`][zeus.simulate.Simulator] supports custom policies +Currently, [`Simulator`][zeus._legacy.simulate.Simulator] supports custom policies for both classes, while [`ZeusMaster`][zeus.run.ZeusMaster] only supports -custom [`BatchSizeOptimizer`][zeus.policy.interface.BatchSizeOptimizer]s. -Custom [`PowerLimitOptimizer`][zeus.policy.PowerLimitOptimizer]s will +custom [`BatchSizeOptimizer`][zeus._legacy.policy.interface.BatchSizeOptimizer]s. +Custom [`PowerLimitOptimizer`][zeus._legacy.policy.PowerLimitOptimizer]s will have to be integrated into [`ZeusDataLoader`][zeus.run.ZeusDataLoader]. 
""" -from zeus.policy.interface import BatchSizeOptimizer, PowerLimitOptimizer -from zeus.policy.optimizer import JITPowerLimitOptimizer, PruningGTSBatchSizeOptimizer +from zeus._legacy.policy.interface import BatchSizeOptimizer, PowerLimitOptimizer +from zeus._legacy.policy.optimizer import ( + JITPowerLimitOptimizer, + PruningGTSBatchSizeOptimizer, +) diff --git a/zeus/policy/interface.py b/zeus/_legacy/policy/interface.py similarity index 95% rename from zeus/policy/interface.py rename to zeus/_legacy/policy/interface.py index 463d9d19..627154e1 100644 --- a/zeus/policy/interface.py +++ b/zeus/_legacy/policy/interface.py @@ -86,11 +86,11 @@ def predict(self, job: Job, batch_size: int) -> int | None: Args: job: The job to pick the best power limit for. batch_size: The batch size chosen by the - [`BatchSizeOptimizer`][zeus.policy.BatchSizeOptimizer] for this job. + [`BatchSizeOptimizer`][zeus._legacy.policy.BatchSizeOptimizer] for this job. Returns: The best power limit, or `None` if profiling results via - [`observe`][zeus.policy.interface.PowerLimitOptimizer.observe] are needed. + [`observe`][zeus._legacy.policy.interface.PowerLimitOptimizer.observe] are needed. """ @abstractmethod diff --git a/zeus/policy/mab.py b/zeus/_legacy/policy/mab.py similarity index 100% rename from zeus/policy/mab.py rename to zeus/_legacy/policy/mab.py diff --git a/zeus/policy/optimizer.py b/zeus/_legacy/policy/optimizer.py similarity index 97% rename from zeus/policy/optimizer.py rename to zeus/_legacy/policy/optimizer.py index 80c0e0e6..b4ff97ff 100644 --- a/zeus/policy/optimizer.py +++ b/zeus/_legacy/policy/optimizer.py @@ -14,8 +14,8 @@ """Implementations of various optimization policies. 
-[`JITPowerLimitOptimizer`][zeus.policy.optimizer.JITPowerLimitOptimizer] and -[`PruningGTSBatchSizeOptimizer`][zeus.policy.optimizer.PruningGTSBatchSizeOptimizer] +[`JITPowerLimitOptimizer`][zeus._legacy.policy.optimizer.JITPowerLimitOptimizer] and +[`PruningGTSBatchSizeOptimizer`][zeus._legacy.policy.optimizer.PruningGTSBatchSizeOptimizer] are the implementations used in Zeus's publication. """ @@ -27,8 +27,8 @@ import numpy as np from zeus.job import Job -from zeus.policy.interface import BatchSizeOptimizer, PowerLimitOptimizer -from zeus.policy.mab import GaussianTS +from zeus._legacy.policy.interface import BatchSizeOptimizer, PowerLimitOptimizer +from zeus._legacy.policy.mab import GaussianTS class GTSBatchSizeOptimizer(BatchSizeOptimizer): @@ -47,7 +47,7 @@ def __init__( ) -> None: """Initialze the optimizer. - Refer to the constructor of [`GaussianTS`][zeus.policy.mab.GaussianTS] + Refer to the constructor of [`GaussianTS`][zeus._legacy.policy.mab.GaussianTS] for descriptions of other arguments. Args: @@ -278,7 +278,7 @@ def __init__( ) -> None: """Initialze the optimizer. - Refer to the constructor of [`GaussianTS`][zeus.policy.mab.GaussianTS] + Refer to the constructor of [`GaussianTS`][zeus._legacy.policy.mab.GaussianTS] for descriptions of other arguments. 
Args: diff --git a/zeus/simulate.py b/zeus/_legacy/simulate.py similarity index 99% rename from zeus/simulate.py rename to zeus/_legacy/simulate.py index b5b02878..da157d57 100644 --- a/zeus/simulate.py +++ b/zeus/_legacy/simulate.py @@ -25,7 +25,7 @@ from zeus.analyze import HistoryEntry from zeus.job import Job -from zeus.policy import BatchSizeOptimizer, PowerLimitOptimizer +from zeus._legacy.policy import BatchSizeOptimizer, PowerLimitOptimizer from zeus.util import zeus_cost diff --git a/zeus/monitor/energy.py b/zeus/monitor/energy.py index 075a9784..773a6125 100644 --- a/zeus/monitor/energy.py +++ b/zeus/monitor/energy.py @@ -65,6 +65,9 @@ class ZeusMonitor: `nvmlDeviceGetTotalEnergyConsumption` API. On older architectures, this API is not supported, so a separate Python process is used to poll `nvmlDeviceGetPowerUsage` to get power samples over time, which are integrated to compute energy consumption. + Since it is spawning the process, the monitor should not be instantiated as a global variable. + Python puts a protection to prevent creating a process in global scope. + Refer to the "Safe importing of main module" section in https://docs.python.org/3/library/multiprocessing.html for more detail. ## Integration Example diff --git a/zeus/monitor/power.py b/zeus/monitor/power.py index 8ab62c3f..ee9060b9 100644 --- a/zeus/monitor/power.py +++ b/zeus/monitor/power.py @@ -119,6 +119,9 @@ def __init__( ) -> None: """Initialize the power monitor. + Initialization should not be done in global scope due to python's protection. + Refer to the "Safe importing of main module" section in https://docs.python.org/3/library/multiprocessing.html for more detail. + Args: gpu_indices: Indices of the GPUs to monitor. If None, monitor all GPUs. update_period: Update period of the power monitor in seconds. 
If None, diff --git a/zeus/optimizer/batch_size/__init__.py b/zeus/optimizer/batch_size/__init__.py new file mode 100644 index 00000000..b0fa38c5 --- /dev/null +++ b/zeus/optimizer/batch_size/__init__.py @@ -0,0 +1 @@ +"""Batch size optimizer server and client.""" diff --git a/zeus/optimizer/batch_size/alembic.ini b/zeus/optimizer/batch_size/alembic.ini new file mode 100644 index 00000000..445eecee --- /dev/null +++ b/zeus/optimizer/batch_size/alembic.ini @@ -0,0 +1,116 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = migrations + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to migrations/versions. 
When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:migrations/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +; sqlalchemy.url = driver://user:pass@localhost/dbname + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. 
See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/zeus/optimizer/batch_size/client.py b/zeus/optimizer/batch_size/client.py new file mode 100644 index 00000000..4df120a2 --- /dev/null +++ b/zeus/optimizer/batch_size/client.py @@ -0,0 +1,217 @@ +"""Zeus batch size optimizer client that communicates with server.""" + +from __future__ import annotations +import atexit + +import httpx +import pynvml +from zeus.callback import Callback +from zeus.monitor import ZeusMonitor +from zeus.optimizer.batch_size.common import ( + GET_NEXT_BATCH_SIZE_URL, + REGISTER_JOB_URL, + REPORT_END_URL, + REPORT_RESULT_URL, + CreatedJob, + JobSpecFromClient, + JobSpec, + TrialId, + ReportResponse, + TrainingResult, +) +from zeus.optimizer.batch_size.exceptions import ( + ZeusBSOBadOperationError, + ZeusBSOConfigError, + ZeusBSOOperationOrderError, + ZeusBSORuntimError, + ZeusBSOTrainFailError, +) +from zeus.util.logging import get_logger + +logger = get_logger(__name__) + + +class BatchSizeOptimizer(Callback): + 
"""Batch size optimizer client that talks to server. One batch size optimizer per one training session of the job.""" + + def __init__( + self, monitor: ZeusMonitor, server_url: str, job: JobSpec, rank: int = 0 + ) -> None: + """Initialize the optimizer, and register the job to the server. + + If job is already registered, check if the job configuration is identical with previously registered config. + + Args: + monitor: zeus monitor + server_url: url of batch size optimizer server + job: job specification. Refer to `JobSpec` for job specifcatio parameters. + rank: rank of gpu in the case of distributed training. We only let rank = 0 gpu to request for a batch size. + """ + self.monitor = monitor + self.server_url = server_url + self.cur_epoch = 0 # 0-indexed + self.running_time = 0.0 + self.consumed_energy = 0.0 + self.training_finished = False + self.trial_number = 0 + self.rank = rank + + # Get max PL + pynvml.nvmlInit() + pls = [] + name = "" + for index in self.monitor.nvml_gpu_indices: + device = pynvml.nvmlDeviceGetHandleByIndex(index) + device_name = str(pynvml.nvmlDeviceGetName(device)) + if name == "": + name = device_name + elif name != device_name: + raise ZeusBSOConfigError( + f"Should use the same GPUs for training: detected({name},{device_name})" + ) + pls.append(pynvml.nvmlDeviceGetPowerManagementLimitConstraints(device)) + + if name == "": + raise ZeusBSOConfigError("No GPUs detected.") + + # set gpu configurations(max_power, number of gpus, and gpu model) + self.job = JobSpecFromClient( + **job.dict(), + max_power=(pls[0][1] // 1000) * len(monitor.gpu_indices), + number_of_gpus=len(monitor.gpu_indices), + gpu_model=name, + ) + + # Track the batch size of current job + self.current_batch_size = 0 + + # Register job + res = httpx.post(self.server_url + REGISTER_JOB_URL, content=self.job.json()) + self._handle_response(res) + + self.job = CreatedJob.parse_obj(res.json()) + + logger.critical( + "Job is registered with job_id: \x1b[31;1m%s\x1b[0m", 
self.job.job_id + ) + logger.info("Job is registered: %s", str(self.job)) + + def get_batch_size(self) -> int: + """Get batch size to use from the BSO server. + + Returns: + return a batch size to use for current job + + Raises: + `ZeusBSORuntimError`: if the batch size we receive is invalid + """ + if self.rank != 0: + raise ZeusBSOBadOperationError("Only rank 0 gpu can ask for a batch size.") + + if self.current_batch_size != 0: + # If we already got the batch size, return + return self.current_batch_size + + self.cur_epoch = 0 + res = httpx.get( + self.server_url + GET_NEXT_BATCH_SIZE_URL, + params={"job_id": self.job.job_id}, + ) + self._handle_response(res) + trial_id = TrialId.parse_obj(res.json()) + + if trial_id.batch_size not in self.job.batch_sizes: + raise ZeusBSORuntimError( + f"Zeus server returned a strange batch_size: {trial_id.batch_size}" + ) + + self.current_batch_size = trial_id.batch_size + self.trial_number = trial_id.trial_number + + logger.info("[BatchSizeOptimizer] Chosen batch size: %s", trial_id.batch_size) + + def report_end() -> None: + httpx.patch(self.server_url + REPORT_END_URL, content=trial_id.json()) + + atexit.register(report_end) + return trial_id.batch_size + + def on_train_begin(self) -> None: + """Start the monitor window and mark training is started.""" + self.training_finished = False + self.monitor.begin_window("BatciSizeOptimizerClient") + + def on_evaluate( + self, + metric: float, + ) -> None: + """Determine whether or not to stop training after evaluation. + + Training stops when + - `max_epochs` was reached, or + - the target metric was reached. or + - Cost exceeded the early stop threshold + + Args: + metric: Validation metric of this epoch. See also `higher_metric_is_better` in [`JobParams`][zeus.optimizer.batch_size.common.JobParams]. + + Raises: + `ZeusBSOOperationOrderError`: When `get_batch_size` was not called first. + `ZeusBSOTrainFailError`: When train failed for a chosen batch size and should be stopped. 
+ This batch size will not be tried again. To proceed training, re-launch the training then bso will select another batch size + `ZeusBSORuntimError`: When the server returns an error + """ + if self.current_batch_size == 0: + raise ZeusBSOOperationOrderError( + "Call get_batch_size to set the batch size first" + ) + + if self.training_finished: + return + + self.cur_epoch += 1 + measurement = self.monitor.end_window("BatciSizeOptimizerClient") + + # Accumulate time and energy + self.running_time += measurement.time + self.consumed_energy += measurement.total_energy + + training_result = TrainingResult( + job_id=self.job.job_id, + batch_size=self.current_batch_size, + trial_number=self.trial_number, + time=self.running_time, + energy=self.consumed_energy, + metric=metric, + current_epoch=self.cur_epoch, + ) + + # report to the server about the result of this training + res = httpx.post( + self.server_url + REPORT_RESULT_URL, content=training_result.json() + ) + self._handle_response(res) + + parsed_response = ReportResponse.parse_obj(res.json()) + + if not parsed_response.stop_train: + # Should keep training. Re-open the window + self.monitor.begin_window("BatciSizeOptimizerClient") + else: + # Train is over. If not converged, raise an error + self.training_finished = True + if not parsed_response.converged: + raise ZeusBSOTrainFailError( + f"Train failed: {parsed_response.message}. This batch size will not be selected again. Please re-launch the training" + ) + + def _handle_response(self, res: httpx.Response) -> None: + """Check if the response is success. Otherwise raise an error with error message from the server. 
+ + Args: + res: response from the server + """ + if not (200 <= (code := res.status_code) < 300): + raise ZeusBSORuntimError( + f"Zeus server returned status code {code}: {res.text}" + ) diff --git a/zeus/optimizer/batch_size/common.py b/zeus/optimizer/batch_size/common.py new file mode 100644 index 00000000..0b8fcddd --- /dev/null +++ b/zeus/optimizer/batch_size/common.py @@ -0,0 +1,172 @@ +"""Shared model definitions for the server and client.""" + +from __future__ import annotations + +from typing import Any, Dict, Optional + +from zeus.util.pydantic_v1 import BaseModel, root_validator, validator, Field + +REGISTER_JOB_URL = "/jobs" +DELETE_JOB_URL = "/jobs" +GET_NEXT_BATCH_SIZE_URL = "/jobs/batch_size" +REPORT_RESULT_URL = "/report" +REPORT_END_URL = "/trials" + + +class JobParams(BaseModel): + """Job parameters. + + Attributes: + job_id: unique ID for the job + batch_sizes: list of batch sizes to try + default_batch_size: first batch size to try + eta_knob: eta for computing `zeus_cost` + beta_knob: beta for early stopping. If min_cost*beta_knob < current_cost, job will be stopped by bso server. + To disable, set it to None. + target_metric: target metric to achieve for training. + higher_is_better_metric: if the goal of training is achieving higher metric than `target_metric` + max_epochs: Maximum number of epochs for a training run. + num_pruning_rounds: Number of rounds we are trying for pruning stage + window_size: For MAB, how many recent measurements to fetch for computing the arm states. If set to 0, fetch all measurements. + + mab_prior_mean: Mean of the belief prior distribution. + mab_prior_precision: Precision of the belief prior distribution. + mab_num_explorations: How many static explorations to run when no observations are available. + mab_seed: The random seed to use. 
+ """ + + job_id: str + job_id_prefix: str + batch_sizes: list[int] + default_batch_size: int = Field(gt=0) + eta_knob: float = 0.5 + beta_knob: Optional[float] = 2.0 + target_metric: float = 0.50 + higher_is_better_metric: bool = True + max_epochs: int = Field(100, gt=0) + num_pruning_rounds: int = Field(2, ge=0) + window_size: int = 10 + + mab_prior_mean: float = 0.0 + mab_prior_precision: float = 0.0 + mab_num_explorations: int = Field(2, ge=0) + mab_seed: Optional[int] = None + + @validator("batch_sizes") + def _validate_batch_sizes(cls, bs: list[int]) -> int: + if bs is not None and len(bs) > 0: + bs.sort() + return bs + else: + raise ValueError(f"Batch Sizes = {bs} is empty") + + @validator("eta_knob") + def _validate_eta_knob(cls, v: float) -> int: + if v < 0 or v > 1: + raise ValueError("eta_knob should be in range [0,1]") + return v + + @validator("beta_knob") + def _validate_beta_knob(cls, v: float) -> int: + if v is None or v > 0: + return v + else: + raise ValueError( + f"Invalid beta_knob({v}). To disable early stop, set beta_knob = None to disable or positive value." + ) + + @root_validator(skip_on_failure=True) + def _check_default_batch_size(cls, values: Dict[str, Any]) -> Dict[str, Any]: + bs = values["default_batch_size"] + bss = values["batch_sizes"] + if bs not in bss: + raise ValueError(f"Default BS({bs}) not in batch_sizes({bss}).") + return values + + +class GpuConfig(BaseModel): + """Gpu configuration of current training.""" + + max_power: float = Field(gt=0) + number_of_gpus: int = Field(gt=0) + gpu_model: str + + @validator("gpu_model") + def _validate_gpu_model(cls, v: str) -> str: + if v is None or v == "": + raise ValueError(f"Invalid gpu_model({v}). Shouldn't be empty.") + else: + return v + + +class JobSpec(JobParams): + """Job specification that user inputs. + + Attributes: + job_id: ID of job. If none is provided, will be created by server. + + Refer [`JobParams`][`zeus.optimizer.batch_size.common.JobParams`] for other attributes. 
class TrialId(BaseModel):
    """Response format from the server for getting a batch size to use.

    The three fields together form the unique identifier of a trial.

    Attributes:
        job_id: ID of job
        batch_size: batch size to use.
        trial_number: trial number of current training.
    """

    job_id: str
    batch_size: int
    trial_number: int
class ZeusBSORuntimError(ZeusBaseError):
    """Raised when the BSO server fails to process a request correctly."""
However, we need to modify `env.py` to connect the DB schema we defined in `schema.py` and set up the db url. + +`env.py`: this script will be invoked when the alembic is running the migration. Database url and db schema are set in this script. DB schema is set through `target_metadata = Base.metadata` and DB url is set by setting `"sqlalchemy.url"` + +## Dependencies + +```Bash +pip install '.[migration]' +``` + +## Generates alembic migration scripts + +This will create scripts under ./versions + +```Bash +alembic revision --autogenerate -m "Baseline: create tables" # will autogenerate tables +``` + +More about `--autogenerate` option: [Alembic Documentation](https://alembic.sqlalchemy.org/en/latest/autogenerate.html) + +## Apply migration + +1. Online (apply it to database directly) + + ```Bash + alembic upgrade head + ``` + +2. Offline (generate sql) + + ```Bash + alembic upgrade head --sql + ``` diff --git a/zeus/optimizer/batch_size/migrations/env.py b/zeus/optimizer/batch_size/migrations/env.py new file mode 100644 index 00000000..9b2c0079 --- /dev/null +++ b/zeus/optimizer/batch_size/migrations/env.py @@ -0,0 +1,98 @@ +"""Environment set up for migration and running a migration.""" + +import asyncio +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool +from sqlalchemy.engine import Connection +from sqlalchemy.ext.asyncio import AsyncEngine + +from alembic import context + +from zeus.optimizer.batch_size.server.config import settings +from zeus.optimizer.batch_size.server.database.schema import Base + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Set database url. +config.set_main_option("sqlalchemy.url", settings.database_url) + +# Interpret the config file for Python logging. +# This line sets up loggers basically. 
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    The Alembic context is configured with only the database URL read from
    the `sqlalchemy.url` option, so no Engine (and no DBAPI) is needed.
    Calls to context.execute() made while the migrations run emit the given
    string to the script output.
    """
    offline_options = {
        "url": config.get_main_option("sqlalchemy.url"),
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
+ + """ + connectable = AsyncEngine( + engine_from_config( + config.get_section(config.config_ini_section), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + future=True, + ) + ) + + async with connectable.connect() as connection: + await connection.run_sync(do_run_migrations) + + await connectable.dispose() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + asyncio.run(run_migrations_online()) diff --git a/zeus/optimizer/batch_size/migrations/script.py.mako b/zeus/optimizer/batch_size/migrations/script.py.mako new file mode 100644 index 00000000..55df2863 --- /dev/null +++ b/zeus/optimizer/batch_size/migrations/script.py.mako @@ -0,0 +1,24 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/zeus/optimizer/batch_size/migrations/versions/.gitkeep b/zeus/optimizer/batch_size/migrations/versions/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/zeus/optimizer/batch_size/server/__init__.py b/zeus/optimizer/batch_size/server/__init__.py new file mode 100644 index 00000000..10ad8b6f --- /dev/null +++ b/zeus/optimizer/batch_size/server/__init__.py @@ -0,0 +1,44 @@ +"""Zeus batch size optimizer server. + +[Description] + +Batch size optimizer is in composed of server and client. The reason for server-client architecture is that we need to maintain the states accross all trainings. +Therefore, we need a central place that is not limited by a scope of one training. 
+ +The server's role is maintaining and updating the state of the job based on the client's reports. +There are three types of states. + +- Job related states: [`JobState`][zeus.optimizer.batch_size.server.job.models.JobState] +- Trial related states: [`Trial`][zeus.optimizer.batch_size.server.batch_size_state.models.Trial] +- MAB related states: [`GaussianTsArmState`][zeus.optimizer.batch_size.server.batch_size_state.models.GaussianTsArmState] + +The client's role is letting the server know that the user started a new training and reporting the result of training. + + +[Structure of code] + +Each domain (batch_size_state and job) is composed of repository, commands, and models. + +- Repository is the lowest layer that modifies the DB. It provides CRUD operations, and performs the corresponding SQL operation for a given request. +- Commands are collections of arguments used to call each method in the repository. They are mainly used to validate the request. +- Models are used to safely perform operations on objects. All ORM objects can be converted into these models and we also have some helper models. + +In the services directory, we have a single service, `ZeusService`, which performs one or more repository operations. It performs business logic, +and provides more complicated operations to the application layer. It also has commands that validate requests to the service's methods. + +[Hierarchy of program] + +``` + | Application layer | Business logic | DB operation | Storage + | | | | +Client request -> Router -> | Optimizer -> Explorer | -> ZeusService | -> JobStateRepository | <-> DB + | -> Mab | | -> BatchSizeStateRepository| + | | | | +``` + +[Database Transaction] + +Each session represents a single transaction. When FastAPI receives the request, it creates a single session. Then, at the end of the request, it commits +all operations to the database.
class CreateTrialBase(BatchSizeBase):
    """Base command to create trial.

    Attributes:
        type: Type of the trial to create.
        start_timestamp: Start time recorded for the trial. Constant field.
        status: Initial status; constant `TrialStatus.Dispatched`.
    """

    type: TrialType
    # NOTE(review): the default `datetime.now()` is evaluated once, when this
    # class body runs, not per instance; with `const=True` callers cannot
    # supply another value. Confirm that a shared timestamp is intended.
    start_timestamp: datetime = Field(datetime.now(), const=True)
    status: TrialStatus = Field(TrialStatus.Dispatched, const=True)

    class Config:
        """Model configuration.

        Make it immutable after it's created.
        """

        frozen = True
class UpdateTrial(BatchSizeBase):
    """Report the result of trial.

    Attributes:
        job_id: ID of job
        batch_size: batch size of the trial
        trial_number: number of the trial being reported
        end_timestamp: time at which the report was created
        status: final status of the trial; must not be `Dispatched`
        time: total time consumption; required unless the trial failed
        energy: total energy consumption; required unless the trial failed
        converged: whether the trial converged; required unless the trial failed
    """

    trial_number: int = Field(gt=0)
    # `default_factory` is called for every instance. A plain
    # `Field(datetime.now(), ...)` default would be computed only once, when
    # the class is defined, and stamp every report with that same instant.
    end_timestamp: datetime = Field(default_factory=datetime.now)
    status: TrialStatus
    time: Optional[float] = Field(None, ge=0)
    energy: Optional[float] = Field(None, ge=0)
    converged: Optional[bool] = None

    class Config:
        """Model configuration.

        Make it immutable after it's created.
        """

        frozen = True

    @validator("status")
    def _check_status(cls, s: TrialStatus) -> TrialStatus:
        """Reject `Dispatched`: a report always describes a finished trial."""
        if s == TrialStatus.Dispatched:
            raise ValueError(
                f"{s} shouldn't be Dispatched since this is reporting the result."
            )
        return s

    @root_validator(skip_on_failure=True)
    def _validate_sanity(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Validate result.

        Unless the status is Failed, time/energy/converged must all be set.
        Nothing is enforced on these fields for a failed trial.
        """
        status: TrialStatus = values["status"]

        time: float | None = values["time"]
        energy: float | None = values["energy"]
        converged: bool | None = values["converged"]

        if status != TrialStatus.Failed and (
            time is None or energy is None or converged is None
        ):
            raise ValueError(
                f"Result is incomplete: time({time}), energy({energy}), converged({converged})"
            )

        return values
class GaussianTsArmState(BatchSizeBase):
    """State of one Gaussian Thompson Sampling arm (one batch size of a job).

    Attributes:
        param_mean (float): Mean of the belief prior distribution.
        param_precision (float): Precision of the belief prior distribution.
        reward_precision (float): Precision (inverse variance) of the reward distribution.
        num_observations (int): How many observations we made.
    """

    param_mean: float
    param_precision: float
    reward_precision: float
    num_observations: int = Field(ge=0)

    class Config:
        """Model configuration.

        Allow building the model from an ORM object and freeze it afterwards.
        """

        orm_mode = True
        frozen = True

    def to_orm(self) -> GaussianTsArmStateTable:
        """Convert pydantic model to ORM object.

        Returns:
            GaussianTsArmStateTable: ORM object carrying every field of this model.
        """
        orm_state = GaussianTsArmStateTable()
        # Copy each model field onto the attribute of the same name.
        for field_name, field_value in self.dict().items():
            setattr(orm_state, field_name, field_value)
        return orm_state
class ExplorationsPerJob(BaseModel):
    """Model representing the explorations of a job that have not failed. Immutable after it's created.

    Attributes:
        job_id (str): The ID of the job.
        explorations_per_bs (dict[int, list[Trial]]): Explorations per batch size, in
            trial_number ascending order. Failed trials are rejected.
    """

    job_id: str
    explorations_per_bs: dict[int, list[Trial]]  # BS -> Trials with exploration type

    class Config:
        """Model configuration.

        Make it immutable after it's created.
        """

        frozen = True

    @root_validator(skip_on_failure=True)
    def _check_explorations(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Check that every trial matches the job_id and the batch size it is filed under.

        Also checks that each trial is an exploration and has not failed.

        Raises:
            ValueError: if any trial is inconsistent with its key or the job.
        """
        job_id: str = values["job_id"]
        exps_per_bs: dict[int, list[Trial]] = values["explorations_per_bs"]

        for bs, exps in exps_per_bs.items():
            # Sort ascending just in case. Sql will return asc order anyways.
            exps.sort(key=lambda x: x.trial_number)
            for exp in exps:
                # Report the offending trial (`exp`). Reading these attributes
                # from the list (`exps`) would raise AttributeError instead.
                if job_id != exp.job_id:
                    raise ValueError(
                        f"job_id doesn't correspond with explorations: {job_id} != {exp.job_id}"
                    )
                if bs != exp.batch_size:
                    raise ValueError(
                        f"Batch size doesn't correspond with explorations: {bs} != {exp.batch_size}"
                    )
                if exp.type != TrialType.Exploration:
                    raise ValueError("Trial type is not equal to Exploration.")
                if exp.status == TrialStatus.Failed:
                    raise ValueError("Should not include failed trial.")

        return values
async def get_next_trial_number(self, job_id: str) -> int:
    """Return the trial number to use for the next trial of `job_id`.

    Queries the largest trial number stored for the job and adds one. When
    the job has no trial yet, numbering starts at 1.
    """
    max_trial_query = select(func.max(TrialTable.trial_number)).where(
        and_(
            TrialTable.job_id == job_id,
        )
    )
    latest = await self.session.scalar(max_trial_query)
    return 1 if latest is None else latest + 1
async def get_arm(self, bs: BatchSizeBase) -> GaussianTsArmState | None:
    """Retrieve the Gaussian Thompson Sampling arm of a job id and batch size.

    Args:
        bs (BatchSizeBase): The batch size object (job_id and batch_size).

    Returns:
        Optional[GaussianTsArmState]: The arm if found, else None. When found,
        the ORM row is also stored in `self.fetched_arm`.
    """
    arm_query = select(GaussianTsArmStateTable).where(
        and_(
            GaussianTsArmStateTable.job_id == bs.job_id,
            GaussianTsArmStateTable.batch_size == bs.batch_size,
        )
    )
    orm_arm = await self.session.scalar(arm_query)
    if orm_arm is not None:
        # Remember the fetched row on the repository.
        self.fetched_arm = orm_arm
        return GaussianTsArmState.from_orm(orm_arm)
    return None
def get_trial_from_session(self, trial: ReadTrial) -> Trial | None:
    """Return the trial remembered in `self.fetched_trial` if it matches `trial`.

    Args:
        trial: job_id, batch_size, trial_number of the wanted trial.

    Returns:
        The object stored in `self.fetched_trial` when its job_id, batch_size
        and trial_number all match; None when nothing has been fetched yet or
        the stored trial is a different one.
    """
    cached = self.fetched_trial
    if cached is None:
        # Nothing fetched so far: report "not found" rather than failing
        # with AttributeError on the comparisons below.
        return None
    if (
        cached.job_id != trial.job_id
        or cached.batch_size != trial.batch_size
        or cached.trial_number != trial.trial_number
    ):
        return None
    return cached
def create_arms(self, new_arms: list[GaussianTsArmState]) -> None:
    """Add Gaussian Thompson Sampling arms to the session.

    Args:
        new_arms (list[GaussianTsArmState]): Arms to create. Each one is
            converted with its `to_orm()` method before being added.
    """
    orm_arms = [arm.to_orm() for arm in new_arms]
    self.session.add_all(orm_arms)
class ZeusBsoSettings(BaseSettings):
    """App setting.

    Attributes:
        database_url: url of database for the server
        echo_sql: log sql statements it executes
        log_level: level of log
    """

    database_url: str
    echo_sql: Union[bool, str] = False  # To prevent conversion error for empty string
    log_level: str = "INFO"

    class Config:
        """Model configuration.

        Set how to find the env variables and how to parse it.
        """

        env_prefix = "ZEUS_BSO_"
        env_file = find_dotenv(filename=".env")
        env_file_encoding = "utf-8"

    @validator("echo_sql")
    def _validate_echo_sql(cls, v) -> bool:
        """Normalize echo_sql to a bool.

        Booleans pass through; the string "true" (any letter case) maps to
        True; every other value, including the empty string, maps to False.
        """
        if isinstance(v, bool):
            return v
        if isinstance(v, str):
            return v.lower() == "true"
        return False

    @validator("log_level")
    def _validate_log_level(cls, v) -> str:
        """Return a known upper-case log level name, falling back to "INFO".

        Names are matched case-insensitively, and the standard spelling
        "WARNING" is accepted alongside "WARN".
        """
        level = v.strip().upper() if isinstance(v, str) else None
        if level not in {
            "NOTSET",
            "DEBUG",
            "INFO",
            "WARN",
            "WARNING",
            "ERROR",
            "CRITICAL",
        }:
            # Default log level
            return "INFO"
        return level
class DatabaseSessionManager:
    """Holds the async engine and the session factory bound to it."""

    def __init__(self, host: str, engine_kwargs: dict[str, Any] = None):
        """Build the async engine and its session maker.

        Args:
            host: Database URL handed to `create_async_engine`.
            engine_kwargs: Extra keyword arguments for `create_async_engine`.
                `None` means no extra arguments.
        """
        extra_kwargs = {} if engine_kwargs is None else engine_kwargs
        self._engine = create_async_engine(host, **extra_kwargs)
        self._sessionmaker = async_sessionmaker(autocommit=False, bind=self._engine)

    async def close(self):
        """Dispose the engine and reset the manager to the uninitialized state.

        Raises:
            `ZeusBSOServerRuntimeError`: If there is no engine to dispose.
        """
        engine = self._engine
        if engine is None:
            raise ZeusBSOServerRuntimeError("DatabaseSessionManager is not initialized")
        await engine.dispose()

        # Both attributes are cleared so later calls fail the None checks.
        self._engine = None
        self._sessionmaker = None

    @contextlib.asynccontextmanager
    async def connect(self) -> AsyncIterator[AsyncConnection]:
        """Yield a connection opened with `engine.begin()`.

        An exception raised inside the `async with` body triggers a rollback
        on the connection and is then re-raised.

        Raises:
            `ZeusBSOServerRuntimeError`: If the engine is not available.
        """
        engine = self._engine
        if engine is None:
            raise ZeusBSOServerRuntimeError("DatabaseSessionManager is not initialized")

        async with engine.begin() as conn:
            try:
                yield conn
            except Exception:
                await conn.rollback()
                raise

    @contextlib.asynccontextmanager
    async def session(self) -> AsyncIterator[AsyncSession]:
        """Yield a fresh session from the session maker.

        The session is rolled back when the body raises and is always closed
        on exit.

        Raises:
            `ZeusBSOServerRuntimeError`: If the session maker is not available.
        """
        make_session = self._sessionmaker
        if make_session is None:
            raise ZeusBSOServerRuntimeError("DatabaseSessionManager is not initialized")

        db_session = make_session()
        try:
            yield db_session
        except Exception:
            await db_session.rollback()
            raise
        finally:
            await db_session.close()
class DatabaseRepository:
    """Common parent of the repositories; keeps the session they operate on."""

    def __init__(self, session: AsyncSession) -> None:
        """Initialize the repository with a session.

        Args:
            session: Session stored as `self.session`.
        """
        self.session = session
+ """ + + __tablename__ = "Job" + + job_id: Mapped[str] = mapped_column(VARCHAR(400), primary_key=True) + job_id_prefix: Mapped[str] = mapped_column(VARCHAR(300), nullable=False) + default_batch_size: Mapped[int] = mapped_column(Integer, nullable=False) + higher_is_better_metric: Mapped[bool] = mapped_column(Boolean, default=True) + eta_knob: Mapped[float] = mapped_column(Float, default=0.5) + beta_knob: Mapped[Optional[float]] = mapped_column(Float, nullable=True) + target_metric: Mapped[float] = mapped_column(Float, default=0.5) + max_epochs: Mapped[int] = mapped_column(Integer, default=100) + num_pruning_rounds: Mapped[int] = mapped_column(Integer, default=2) + window_size: Mapped[int] = mapped_column(Integer, default=10) + + max_power: Mapped[float] = mapped_column(Float, nullable=False) + number_of_gpus: Mapped[int] = mapped_column(Integer, nullable=False) + gpu_model: Mapped[str] = mapped_column(VARCHAR(length=30), nullable=False) + + mab_prior_mean: Mapped[float] = mapped_column(Float, default=0.0) + mab_prior_precision: Mapped[float] = mapped_column(Float, default=0.0) + mab_num_explorations: Mapped[int] = mapped_column(Integer, default=2) + mab_seed: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + + mab_random_generator_state: Mapped[Optional[str]] = mapped_column( + VARCHAR(length=10000), nullable=True + ) + exp_default_batch_size: Mapped[int] = mapped_column(Integer, nullable=False) + + stage: Mapped[Stage] = mapped_column(Enum(Stage), default=Stage.Pruning) + min_cost: Mapped[Optional[float]] = mapped_column(Float, nullable=True) + min_cost_batch_size: Mapped[int] = mapped_column(Integer, nullable=False) + + batch_sizes: Mapped[list["BatchSizeTable"]] = relationship( + order_by="BatchSizeTable.batch_size.asc()", + back_populates="job", + # always fetch batch size(int) whenever we fetch the job. 
class GaussianTsArmStateTable(Base):
    """Gaussian arm state schema. Represents the Gaussian Thompson Sampling arm state of a batch size.

    Refer to [`GaussianTsArmState`][zeus.optimizer.batch_size.server.batch_size_state.models.GaussianTsArmState] for attributes.
    """

    __tablename__ = "GaussianTsArmState"

    # Same length as `Job.job_id` (VARCHAR(400)). This column references that
    # key through `BatchSize`, so a shorter column could not hold every valid id.
    job_id: Mapped[str] = mapped_column(VARCHAR(400), primary_key=True)
    batch_size: Mapped[int] = mapped_column(Integer, primary_key=True)  # arm

    param_mean: Mapped[float] = mapped_column(Float, default=0.0)
    param_precision: Mapped[float] = mapped_column(Float, default=0.0)
    reward_precision: Mapped[float] = mapped_column(Float, default=0.0)
    num_observations: Mapped[int] = mapped_column(Integer, default=0)

    batch_size_state: Mapped["BatchSizeTable"] = relationship(
        back_populates="arm_state"
    )

    # Composite foreign key to the owning (job_id, batch_size) row.
    __table_args__ = (
        ForeignKeyConstraint(
            [job_id, batch_size],
            [BatchSizeTable.job_id, BatchSizeTable.batch_size],
            ondelete="CASCADE",
        ),
    )


class TrialType(enum.Enum):
    """Type of trial.

    Exploration is a trial done during Pruning stage.
    Concurrent is a trial done as a concurrent job submission.
    MAB is a trial done during the MAB stage.
    """

    Exploration = "Exploration"
    Concurrent = "Concurrent"
    MAB = "MAB"


class TrialStatus(enum.Enum):
    """Status of trial.

    Dispatched means this trial is issued.
    Succeeded means trial ended without error.
    Failed means trial ended with error.
    """

    Dispatched = "Dispatched"
    Succeeded = "Succeeded"
    Failed = "Failed"


class TrialTable(Base):
    """Represents each trial of training.

    Refer to [`Trial`][zeus.optimizer.batch_size.server.batch_size_state.models.Trial] for attributes.
    """

    __tablename__ = "Trial"

    # Same length as `Job.job_id` (VARCHAR(400)). This column references that
    # key through `BatchSize`, so a shorter column could not hold every valid id.
    job_id: Mapped[str] = mapped_column(VARCHAR(400), primary_key=True, nullable=False)
    batch_size: Mapped[int] = mapped_column(Integer, nullable=False)
    trial_number: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False)
    start_timestamp: Mapped[datetime] = mapped_column(DateTime, nullable=False)
    type: Mapped[TrialType] = mapped_column(Enum(TrialType), nullable=False)
    status: Mapped[TrialStatus] = mapped_column(Enum(TrialStatus), nullable=False)

    # Result columns; nullable in the schema.
    end_timestamp: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    time: Mapped[Optional[float]] = mapped_column(Float, nullable=True)
    energy: Mapped[Optional[float]] = mapped_column(Float, nullable=True)
    converged: Mapped[Optional[bool]] = mapped_column(Boolean, nullable=True)

    batch_size_state: Mapped["BatchSizeTable"] = relationship(back_populates="trials")

    # Composite foreign key to the owning (job_id, batch_size) row.
    __table_args__ = (
        ForeignKeyConstraint(
            [job_id, batch_size],
            [BatchSizeTable.job_id, BatchSizeTable.batch_size],
            ondelete="CASCADE",
        ),
    )
class PruningExploreManager:
    """Pruning manager that manages the batch size states in pruning stage."""

    def __init__(self, service: ZeusService):
        """Set up zeus service."""
        self.service = service

    async def next_batch_size(
        self,
        job: JobState,
        exploration_history: ExplorationsPerJob,
    ) -> ReadTrial | list[int]:
        """Find the next batch size to explore.

        The pruning rounds are replayed from the first one against the recorded
        explorations. Three cases are possible.

        1. Pruning stage: a batch size has not been explored in the current
            round, so an exploration trial is created for it.
        2. Concurrent job: an exploration of the current round is still
            "Dispatched", so a concurrent trial is created with the job's
            `min_cost_batch_size`.
        3. MAB stage: every round has been replayed to its end.

        `job.batch_sizes` is not modified; the method works on sorted copies.

        Args:
            job: state of the job
            exploration_history: explorations recorded for that job, per batch size

        Returns:
            The trial returned by `service.create_trial` in cases 1 and 2.
            In case 3, the sorted list of batch sizes that converged in the last round.

        Raises:
            `ValueError`: If the exploration default batch size of a round is not
                among that round's candidate batch sizes.
            `ZeusBSOServerRuntimeError`: If no batch size converged in a round.
        """
        explorations = exploration_history.explorations_per_bs
        batch_sizes = job.batch_sizes
        exp_default_bs = job.default_batch_size

        for round_idx in range(job.num_pruning_rounds):
            converged_bs_list = []

            min_cost_of_round = float("inf")
            min_batch_size_of_round = 0

            # sorted() makes a copy, so the caller's list is left untouched.
            batch_sizes = sorted(batch_sizes)
            idx = batch_sizes.index(exp_default_bs)
            # Walk down from the default batch size first, then up from it.
            down = list(reversed(batch_sizes[: idx + 1]))
            up = batch_sizes[idx + 1 :]

            for bs_list in (down, up):
                for bs in bs_list:
                    trials = explorations.get(bs, [])
                    if len(trials) <= round_idx:
                        # Did not explore this round. Should explore!
                        return await self.service.create_trial(
                            CreateExplorationTrial(
                                job_id=job.job_id,
                                batch_size=bs,
                            )
                        )

                    # Already explored at this round.
                    trial = trials[round_idx]
                    if trial.status == TrialStatus.Dispatched:
                        # We are waiting for the result of this exploration -> Concurrent job!
                        return await self.service.create_trial(
                            CreateConcurrentTrial(
                                job_id=job.job_id,
                                batch_size=job.min_cost_batch_size,
                            )
                        )

                    if not trial.converged:
                        # Failed to converge -> Go to next list or round
                        break

                    # Training converged.
                    converged_bs_list.append(bs)
                    cost = zeus_cost(
                        trial.energy, trial.time, job.eta_knob, job.max_power
                    )
                    if cost < min_cost_of_round:
                        min_cost_of_round = cost
                        min_batch_size_of_round = bs

            # We should go to next round. Update exp_default_bs and batch sizes!
            exp_default_bs = min_batch_size_of_round
            batch_sizes = converged_bs_list

            logger.info(
                "[PruningExploreManager] go to next round(%d) new default bs = %d converged bs list = %s",
                round_idx,
                exp_default_bs,
                batch_sizes,
            )

            if len(batch_sizes) == 0:
                raise ZeusBSOServerRuntimeError(
                    "No converged batch sizes has observed. Reconfigure batch_sizes and re-launch the job."
                )
        # After going through pruning rounds, we couldn't find the bs. Should go to MAB stage, so return good batch_sizes.
        return sorted(batch_sizes)
class CreateJob(GpuConfig, JobParams):
    """Parameters to create a new job.

    Attributes:
        exp_default_batch_size: Exploration default batch size that is used during Pruning stage.
        min_cost: Min training cost observed. Initially, None.
        min_cost_batch_size: Batch size that has minimum training cost observed.
        stage: Stage of the job.
        mab_random_generator_state: Generator state if mab_seed is not None. Otherwise, None.

    For the rest of attributes, refer to [`JobParams`][zeus.optimizer.batch_size.common.JobParams] and [`GpuConfig`][zeus.optimizer.batch_size.common.GpuConfig]
    """

    exp_default_batch_size: int
    min_cost: None = Field(None, const=True)
    min_cost_batch_size: int
    stage: Stage = Field(Stage.Pruning, const=True)
    mab_random_generator_state: Optional[str] = None

    class Config:
        """Model configuration.

        Make it immutable after creation.
        """

        frozen = True

    @root_validator(skip_on_failure=True)
    def _validate_states(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Validate Job states.

        We are checking,
            - If mab seed and generator state is matching.
            - If default, exp_default, min batch sizes are correctly initialized.
            - If default batch size is in the list of batch sizes.
        """
        state: str | None = values["mab_random_generator_state"]
        mab_seed: int | None = values["mab_seed"]
        bss: list[int] = values["batch_sizes"]
        dbs: int = values["default_batch_size"]
        ebs: int = values["exp_default_batch_size"]
        mbs: int = values["min_cost_batch_size"]

        if mab_seed is not None:
            if state is None:
                raise ValueError("mab_seed is not none, but generator state is none")
            else:
                try:
                    # Loading the state into a throwaway generator checks that it is usable.
                    np.random.default_rng(1).__setstate__(json.loads(state))
                except (TypeError, ValueError) as err:
                    raise ValueError(f"Invalid generator state ({state})") from err

        if not (dbs == ebs == mbs):
            raise ValueError(
                f"During initialization, default_batch_size({dbs}), exp_default_batch_size({ebs}), min_batch_size({mbs}) should be all the same"
            )
        if dbs not in bss:
            raise ValueError(
                f"default_batch_size({dbs}) is not in the batch size list({bss})"
            )

        return values

    @staticmethod
    def from_job_config(js: JobSpecFromClient) -> "CreateJob":
        """From JobConfig, instantiate `CreateJob`.

        Initialize generator state, exp_default_batch_size, and min_cost_batch_size.
        Declared as a static method because it takes no instance; it can be
        called on the class or on an instance.

        Args:
            js: Job specification received from the client.

        Returns:
            A validated `CreateJob` whose exploration default and min-cost batch
            sizes both equal `js.default_batch_size`.
        """
        d = js.dict()
        d["exp_default_batch_size"] = js.default_batch_size
        if js.mab_seed is not None:
            # Store the state of a generator seeded with mab_seed as JSON.
            rng = np.random.default_rng(js.mab_seed)
            d["mab_random_generator_state"] = json.dumps(rng.__getstate__())
        d["min_cost_batch_size"] = js.default_batch_size
        return CreateJob.parse_obj(d)

    def to_orm(self) -> JobTable:
        """Convert pydantic model `CreateJob` to ORM object Job.

        Every field except `batch_sizes` is copied onto the `JobTable` as is.
        Each batch size becomes one `BatchSizeTable` row attached to the job.
        """
        d = self.dict()
        job = JobTable()
        for k, v in d.items():
            if k != "batch_sizes":
                setattr(job, k, v)
        job.batch_sizes = [
            BatchSizeTable(job_id=self.job_id, batch_size=bs) for bs in self.batch_sizes
        ]
        return job
+ stage: Stage of the job. + mab_random_generator_state: Generator state if mab_seed is not None. Otherwise, None. + + For the rest of attributes, refer to [`JobParams`][zeus.optimizer.batch_size.common.JobParams] and [`GpuConfig`][zeus.optimizer.batch_size.common.GpuConfig] + """ + + exp_default_batch_size: int + + min_cost: Optional[float] = None + min_cost_batch_size: int + stage: Stage = Stage.Pruning + + mab_random_generator_state: Optional[str] = None + + class Config: + """Model configuration. + + Allow instantiating the model from an ORM object. + """ + + orm_mode = True + getter_dict = JobGetter + + @root_validator(skip_on_failure=True) + def _validate_mab(cls, values: dict[str, Any]) -> dict[str, Any]: + """Validate generator state.""" + state: str | None = values["mab_random_generator_state"] + mab_seed: int | None = values["mab_seed"] + + if mab_seed is not None: + if state is None: + raise ValueError("mab_seed is not none, but generator state is none") + else: + try: + # Check sanity of the generator state. 
class JobStateRepository(DatabaseRepository):
    """Repository that provides basic interfaces to interact with Job table."""

    def __init__(self, session: AsyncSession):
        """Set db session and initialize job. We are working with only one job per session."""
        super().__init__(session)
        self.fetched_job: JobTable | None = None

    def _require_fetched_job(self, job_id: str) -> JobTable:
        """Return the fetched job after checking that it is the job `job_id`.

        Raises:
            `ZeusBSOServiceBadOperationError`: If no job has been fetched.
            `ZeusBSOValueError`: If the fetched job has a different id.
        """
        if self.fetched_job is None:
            raise ZeusBSOServiceBadOperationError("No job is fetched.")
        if self.fetched_job.job_id != job_id:
            raise ZeusBSOValueError(
                f"Unknown job_id ({job_id}). Expecting {self.fetched_job.job_id}"
            )
        return self.fetched_job

    async def get_job(self, job_id: str) -> JobState | None:
        """Get job State, which includes jobSpec + batch_sizes(list[int]), without specific states of each batch_size.

        Args:
            job_id: Job id.

        Returns:
            Set `fetched_job` and return the `JobState` if the job was found.
            Otherwise return None and leave `fetched_job` unchanged.
        """
        stmt = select(JobTable).where(JobTable.job_id == job_id)
        job = await self.session.scalar(stmt)

        if job is None:
            logger.info("get_job: NoResultFound")
            return None

        self.fetched_job = job
        return JobState.from_orm(job)

    def get_job_from_session(self, job_id: str) -> JobState | None:
        """Get a job that was fetched from this session.

        Args:
            job_id: Job id.

        Returns:
            Corresponding `JobState`, converted from the fetched ORM row the same
            way `get_job` does. If none was found, return None.
        """
        if not self.check_job_fetched(job_id):
            return None
        return JobState.from_orm(self.fetched_job)

    def update_exp_default_bs(self, updated_bs: UpdateExpDefaultBs) -> None:
        """Update exploration default batch size on fetched job.

        Args:
            updated_bs: Job Id and new batch size.

        Raises:
            `ZeusBSOServiceBadOperationError`: If no job has been fetched.
            `ZeusBSOValueError`: If the fetched job has a different id.
        """
        job = self._require_fetched_job(updated_bs.job_id)
        job.exp_default_batch_size = updated_bs.exp_default_batch_size

    def update_stage(self, updated_stage: UpdateJobStage) -> None:
        """Update stage on fetched job.

        Args:
            updated_stage: Job Id and new stage.

        Raises:
            `ZeusBSOServiceBadOperationError`: If no job has been fetched.
            `ZeusBSOValueError`: If the fetched job has a different id.
        """
        job = self._require_fetched_job(updated_stage.job_id)
        job.stage = updated_stage.stage

    def update_min(self, updated_min: UpdateJobMinCost) -> None:
        """Update exploration min training cost and corresponding batch size on fetched job.

        Args:
            updated_min: Job Id, new min cost and batch size.

        Raises:
            `ZeusBSOServiceBadOperationError`: If no job has been fetched.
            `ZeusBSOValueError`: If the fetched job has a different id.
        """
        job = self._require_fetched_job(updated_min.job_id)
        job.min_cost = updated_min.min_cost
        job.min_cost_batch_size = updated_min.min_cost_batch_size

    def update_generator_state(self, updated_state: UpdateGeneratorState) -> None:
        """Update generator state on fetched job.

        Args:
            updated_state: Job Id and new generator state.

        Raises:
            `ZeusBSOServiceBadOperationError`: If no job has been fetched.
            `ZeusBSOValueError`: If the fetched job has a different id.
        """
        job = self._require_fetched_job(updated_state.job_id)
        job.mab_random_generator_state = updated_state.state

    def create_job(self, new_job: CreateJob) -> None:
        """Create a new job by adding a new job to the session.

        Args:
            new_job: Job configuration for a new job.
        """
        self.session.add(new_job.to_orm())

    def check_job_fetched(self, job_id: str) -> bool:
        """Check if this job is already fetched before.

        Args:
            job_id: Job id.

        Returns:
            True if this job was fetched and in session. Otherwise, return false.
        """
        return self.fetched_job is not None and self.fetched_job.job_id == job_id

    async def delete_job(self, job_id: str) -> bool:
        """Delete the job of a given job_id.

        Args:
            job_id: Job id.

        Returns:
            True if the job got deleted. False if no such job exists.
        """
        stmt = select(JobTable).where(JobTable.job_id == job_id)
        job = await self.session.scalar(stmt)

        if job is None:
            return False

        # We can't straight delete using a query, since some db such as sqlite
        # Foreign Key is default to OFF, so "on delete = cascade" will not be fired.
        await self.session.delete(job)
        return True
+ prior_precision: Precision of the belief prior distribution. + rewards: Array of rewards observed by pulling that arm. + + Returns: + Updated arm state + """ + if len(rewards) == 0: + return + + variance = np.var(rewards) + reward_prec = np.inf if variance == 0.0 else np.reciprocal(variance) + + # Reset to priors + mean = prior_mean + prec = prior_precision + + # Compute the parameters of the posterior distribution. + # The reward distribution's precision is given as infinite only when we + # have exactly one observation for the arm, s.t. sampling yields that + # exact observation. + if reward_prec == np.inf: + new_prec = np.inf + new_mean = rewards.mean() + else: + new_prec = prec + len(rewards) * reward_prec + new_mean = (prec * mean + reward_prec * rewards.sum()) / new_prec + + # Updated state. + return GaussianTsArmState( + job_id=bs_base.job_id, + batch_size=bs_base.batch_size, + param_mean=new_mean, + param_precision=new_prec, + reward_precision=reward_prec, + num_observations=len(rewards), + ) + + def predict( + self, + job_id: str, + prior_precision: float, + num_exploration: int, + arms: list[GaussianTsArmState], + ) -> int: + """Return the arm with the largest sampled expected reward. + + Args: + job_id: job id + prior_precision: Precision of the belief prior distribution. + num_exploration: How many static explorations to run when no observations are available. + arms: list of arms + + Returns: + batch size to use + """ + arm_dict = {arm.batch_size: arm for arm in arms} + + # Exploration-only phase. + # Order is random considering concurrent bandit scenarios. + choices = self.service.get_random_choices( + GetRandomChoices(job_id=job_id, choices=[arm.batch_size for arm in arms]) + ) + + for arm in choices: + if arm_dict[arm].num_observations < num_exploration: + logger.info("[%s] Explore arm %s.", self.name, str(arm)) + return arm + + # Thomopson Sampling phase. + # Sample the expected reward for each arm. 
+ # Assumes that each arm has been explored at least once. Otherwise, + # a value will be sampled from the prior. + + expectations = {} # A mapping from every arm to their sampled expected reward. + for arm in arms: + if arm.param_precision == prior_precision: + logger.warning( + "predict_expectations called when arm '%d' is cold.", + arm.batch_size, + stacklevel=1, + ) + expectations[arm.batch_size] = self.service.get_normal( + GetNormal( + job_id=job_id, + loc=arm.param_mean, + scale=np.sqrt(np.reciprocal(arm.param_precision)), + ) + ) + + logger.info("[%s] Sampled mean rewards:", self.name) + for arm, sample in expectations.items(): + logger.info( + "[%s] Arm %d: mu ~ N(%.2f, %.2f) -> %.2f", + self.name, + arm, + arm_dict[arm].param_mean, + 1 / arm_dict[arm].param_precision, + sample, + ) + + bs = max(expectations, key=expectations.get) + logger.info("%s in Thompson Sampling stage -> BS = %d", job_id, bs) + return bs + + async def construct_mab( + self, job: JobState, evidence: ExplorationsPerJob, good_bs: list[int] + ) -> list[GaussianTsArmState]: + """Construct arms and initialize them. + + Args: + job: state of job. + evidence: Completed explorations. We create arms based on the explorations we have done during pruning stage. + good_bs: Converged batch size list. + + Returns: + list of arms that we created + + Raises: + `ValueError`: If exploration states is invalid (ex. number of pruning rounds doesn't corresponds) + `ZeusBSOValueError`: No converged batch sizes from pruning stage. + """ + if job.job_id != evidence.job_id: + raise ZeusBSOServiceBadOperationError( + f"Job Id is not consistent: job({job.job_id}) != explorations({evidence.job_id})" + ) + + if len(good_bs) == 0: + raise ZeusBSOValueError("While creating arms, no batch size is selected") + + logger.info( + "Construct MAB for %s with arms %s", + job.job_id, + str(good_bs), + ) + + new_arms: list[GaussianTsArmState] = [] + + # Fit the arm for each good batch size. 
+ for _, bs in enumerate(good_bs): + rewards = [] + # Collect rewards starting from the most recent ones and backwards. + for trial in evidence.explorations_per_bs[bs]: + rewards.append( + -zeus_cost(trial.energy, trial.time, job.eta_knob, job.max_power) + ) + + new_arms.append( + # create an arm + self._fit_arm( + BatchSizeBase(job_id=job.job_id, batch_size=bs), + job.mab_prior_mean, + job.mab_prior_precision, + np.array(rewards), + ) + ) + + # submit new arms to db + self.service.create_arms(new_arms) + # update job stage from pruning to mab since we created arms + self.service.update_job_stage( + UpdateJobStage(job_id=job.job_id, stage=Stage.MAB) + ) + return new_arms + + async def report(self, job: JobState, trial_result: UpdateTrial) -> None: + """Based on the measurement, update the arm state. + + Args: + job: state of the job + trial_result: result of training (job id, batch_size, trial_number) + + Raises: + `ZeusBSOValueError`: When the arm (job id, batch_size) doesn't exist + """ + # Since we're learning the reward precision, we need to + # 1. re-compute the precision of this arm based on the reward history, + # 2. update the arm's reward precision + # 3. and `fit` the new MAB instance on all the reward history. + # Note that `arm_rewards` always has more than one entry (and hence a + # non-zero variance) because we've been through pruning exploration. + batch_size_key = BatchSizeBase( + job_id=job.job_id, batch_size=trial_result.batch_size + ) + + # Get measurements of this bs in descending order. At most window_size length + history = await self.service.get_trial_results_of_bs(batch_size_key) + + if len(history.results) >= job.window_size and job.window_size > 0: + # if the history is already above the window size, pop the last one to leave the spot for the current measurement. + history.results.pop() + history.results.reverse() # Now ascending order. 
+ + costs = [ + -zeus_cost(m.energy, m.time, job.eta_knob, job.max_power) + for m in history.results + ] + # Add current measurement to the costs + costs.append( + -zeus_cost( + trial_result.energy, trial_result.time, job.eta_knob, job.max_power + ) + ) + arm_rewards = np.array(costs) + + logger.info("Arm_rewards: %s", str(arm_rewards)) + + # Get current arm. + arm = await self.service.get_arm(batch_size_key) + + if arm is None: + raise ZeusBSOValueError( + f"MAB stage but Arm for batch size({trial_result.batch_size}) is not found." + ) + + # Get a new arm state based on observation + new_arm = self._fit_arm( + batch_size_key, job.mab_prior_mean, job.mab_prior_precision, arm_rewards + ) + + # update the new arm state in db + self.service.update_arm_state( + UpdateArm( + trial=ReadTrial( + job_id=trial_result.job_id, + batch_size=trial_result.batch_size, + trial_number=trial_result.trial_number, + ), + updated_arm=new_arm, + ) + ) + # update corresponding trial + self.service.update_trial(trial_result) + + arm_rewards_repr = ", ".join([f"{r:.2f}" for r in arm_rewards]) + logger.info( + "%s @ %d: arm_rewards = [%s], reward_prec = %.2f", + job.job_id, + trial_result.batch_size, + arm_rewards_repr, + new_arm.reward_precision, + ) diff --git a/zeus/optimizer/batch_size/server/optimizer.py b/zeus/optimizer/batch_size/server/optimizer.py new file mode 100644 index 00000000..d35ee04c --- /dev/null +++ b/zeus/optimizer/batch_size/server/optimizer.py @@ -0,0 +1,280 @@ +"""Batch size optimizer top-most layer that provides register/report/predict.""" + +from __future__ import annotations +import hashlib +import time + +import numpy as np +from zeus.optimizer.batch_size.common import ( + JobSpecFromClient, + TrialId, + ReportResponse, + TrainingResult, +) +from zeus.optimizer.batch_size.server.batch_size_state.commands import ( + CreateMabTrial, + ReadTrial, + UpdateTrial, +) +from zeus.optimizer.batch_size.server.database.schema import TrialStatus +from 
class ZeusBatchSizeOptimizer:
    """Batch size optimizer server. Manages which stage the job is in and call corresponding manager (pruning or mab)."""

    def __init__(self, service: ZeusService) -> None:
        """Initialize the server. Set the service, pruning manager, and mab.

        Args:
            service: ZeusService for interacting with database
        """
        self.service = service
        self.pruning_manager = PruningExploreManager(service)
        self.mab = GaussianTS(service)

    async def register_job(self, job: JobSpecFromClient) -> bool:
        """Register a job that user submitted. If the job id already exists, check if it is identical with previously registered configuration.

        When `job.job_id` is None, a fresh id is generated from `job.job_id_prefix`
        and written back into `job`.

        Args:
            job: job configuration

        Returns:
            True if a job is registered, False if a job already exists and identical with previous configuration

        Raises:
            [`ZeusBSOJobConfigMismatchError`][zeus.optimizer.batch_size.server.exceptions.ZeusBSOJobConfigMismatchError]: In the case of existing job, if job configuration doesn't match with previously registered config
        """
        registered_job = None

        if job.job_id is None:
            # Generate ids until we find one that is not taken yet.
            while True:
                job.job_id = f"{job.job_id_prefix}-{hashlib.sha256(str(time.time()).encode()).hexdigest()[:8]}"
                if (await self.service.get_job(job.job_id)) is None:
                    break
        else:
            registered_job = await self.service.get_job(job.job_id)

        if registered_job is not None:
            # Job exists
            logger.info("Job(%s) already exists", job.job_id)
            registered_job_config = JobSpecFromClient.parse_obj(registered_job.dict())

            # check if it is identical
            if registered_job_config != job:
                raise ZeusBSOJobConfigMismatchError(
                    "JobSpec doesn't match with existing jobSpec. Use a new job_id for different configuration"
                )
            return False

        self.service.create_job(CreateJob.from_job_config(job))
        logger.info("Registered %s", job.job_id)

        return True

    async def _create_mab_trial(self, job, arms):
        """Ask the MAB for the next batch size and create a MAB trial for it."""
        batch_size = self.mab.predict(
            job.job_id, job.mab_prior_precision, job.mab_num_explorations, arms
        )
        return await self.service.create_trial(
            CreateMabTrial(job_id=job.job_id, batch_size=batch_size)
        )

    async def predict(self, job_id: str) -> TrialId:
        """Return a batch size to use.

        Args:
            job_id: Id of job

        Returns:
            Identifier of the newly issued trial, which carries the batch size to use

        Raises:
            [`ZeusBSOValueError`][zeus.optimizer.batch_size.server.exceptions.ZeusBSOValueError]: If the job id is unknown, or creating a mab failed due to no converged batch size
        """
        job = await self.service.get_job(job_id)

        if job is None:
            raise ZeusBSOValueError(
                f"Unknown job({job_id}). Please register the job first"
            )

        if job.stage == Stage.MAB:
            # If we are in MAB stage, use mab to get the next batch size
            arms = await self.service.get_arms(job_id)
            next_trial = await self._create_mab_trial(job, arms)
        else:
            # Pruning stage
            explorations = await self.service.get_explorations_of_job(job_id)
            # First check if pruning explorer can give us any batch size.
            # A list result is handled below as the signal to move to the MAB stage.
            res = await self.pruning_manager.next_batch_size(job, explorations)

            if isinstance(res, list):
                # MAB stage: construct MAB and update the job stage to MAB. Return the batch size from MAB
                logger.info("Constructing a MAB")
                arms = await self.mab.construct_mab(job, explorations, res)
                next_trial = await self._create_mab_trial(job, arms)
            else:
                next_trial = res

        return TrialId(
            job_id=next_trial.job_id,
            batch_size=next_trial.batch_size,
            trial_number=next_trial.trial_number,
        )

    async def report(self, result: TrainingResult) -> ReportResponse:
        """Report the training result. Stop train if the train is converged or reached max epochs or reached early stop threshold. Otherwise, keep training.

        Args:
            result: result of training [`TrainingResult`][zeus.optimizer.batch_size.common.TrainingResult].

        Returns:
            Decision on training [`ReportResponse`][zeus.optimizer.batch_size.common.ReportResponse].

        Raises:
            [`ZeusBSOServiceBadOperationError`][zeus.optimizer.batch_size.server.exceptions.ZeusBSOServiceBadOperationError]: If the reported trial does not exist.
        """
        cost_ub = np.inf
        job = await self.service.get_job(result.job_id)
        trial = await self.service.get_trial(
            ReadTrial(
                job_id=result.job_id,
                batch_size=result.batch_size,
                trial_number=result.trial_number,
            )
        )

        # Must come before any attribute access on `trial`.
        if trial is None:
            raise ZeusBSOServiceBadOperationError(f"Unknown trial {result}")

        if trial.status != TrialStatus.Dispatched:
            # result is already reported
            return ReportResponse(
                stop_train=True,
                converged=trial.converged,
                message=f"Result for this trial({trial.trial_number}) is already reported.",
            )

        if job.beta_knob is not None and job.min_cost is not None:  # Early stop enabled
            cost_ub = job.beta_knob * job.min_cost

        reported_cost = zeus_cost(
            result.energy,
            result.time,
            job.eta_knob,
            job.max_power,
        )

        within_cost_range = cost_ub >= reported_cost
        converged = (
            job.higher_is_better_metric and job.target_metric <= result.metric
        ) or (not job.higher_is_better_metric and job.target_metric >= result.metric)

        if (
            within_cost_range
            and result.current_epoch < job.max_epochs
            and not converged
        ):
            # If it's not converged but below cost upper bound and haven't reached max_epochs, keep training
            return ReportResponse(
                stop_train=False,
                converged=False,
                message="Stop condition not met, keep training",
            )

        # Two cases below here (training ended)
        # 1. Converged == true
        # 2. reached max_epoch OR exceeded upper bound cost (error case)
        if converged and within_cost_range:
            message = "Train succeeded"
        elif not within_cost_range:
            message = f"""Batch Size({result.batch_size}) exceeded the cost upper bound: current cost({reported_cost}) >
                    beta_knob({job.beta_knob})*min_cost({job.min_cost})"""
        else:
            # not converged
            message = f"Train failed to converge within max_epoch({job.max_epochs})"

        trial_result = UpdateTrial(
            job_id=result.job_id,
            batch_size=result.batch_size,
            status=TrialStatus.Succeeded,
            trial_number=result.trial_number,
            time=result.time,
            energy=result.energy,
            converged=converged and within_cost_range,
        )

        if job.stage == Stage.MAB:
            await self.mab.report(job, trial_result)
        else:
            # Pruning stage
            logger.info(
                "%s in pruning stage, Current BS %s that did %s converge.",
                result.job_id,
                result.batch_size,
                "not" * (not converged),
            )
            # update trial
            self.service.update_trial(trial_result)

        return ReportResponse(
            stop_train=True, converged=trial_result.converged, message=message
        )

    async def end_trial(self, trial_id: TrialId) -> None:
        """Mark the trial as finished. If status is still `Dispatched` make the trial as `Failed`.

        Args:
            trial_id: Unique identifier of trial

        Raises:
            [`ZeusBSOServerNotFoundError`][zeus.optimizer.batch_size.server.exceptions.ZeusBSOServerNotFoundError]: If there is no corresponding trial.
        """
        trial = await self.service.get_trial(ReadTrial(**trial_id.dict()))

        if trial is None:
            raise ZeusBSOServerNotFoundError(f"Could not find the trial: {trial_id}")

        if trial.status == TrialStatus.Dispatched:
            # No result was reported for this trial, so it is recorded as failed.
            self.service.update_trial(
                UpdateTrial(
                    job_id=trial_id.job_id,
                    batch_size=trial_id.batch_size,
                    trial_number=trial_id.trial_number,
                    status=TrialStatus.Failed,
                )
            )

    async def delete_job(self, job_id: str) -> None:
        """Delete a job.

        Args:
            job_id: ID of a job.

        Raises:
            [`ZeusBSOServerNotFoundError`][zeus.optimizer.batch_size.server.exceptions.ZeusBSOServerNotFoundError]: If no job was deleted.
        """
        if not (await self.service.delete_job(job_id)):
            raise ZeusBSOServerNotFoundError("No job was deleted.")
@app.delete(DELETE_JOB_URL)
async def delete_job(
    job_id: str, db_session: AsyncSession = Depends(get_db_session)
) -> None:
    """Endpoint for users to delete a job.

    Commits on success. On failure the session is rolled back and a JSON
    error response with the corresponding status code is returned.
    """
    async with app.job_locks[job_id]:
        try:
            optimizer = ZeusBatchSizeOptimizer(ZeusService(db_session))
            await optimizer.delete_job(job_id)
            await db_session.commit()
        except ZeusBSOServerBaseError as err:
            await db_session.rollback()
            return JSONResponse(
                status_code=err.status_code,
                content={"message": err.message},
            )
        except Exception as err:
            await db_session.rollback()
            logger.error("Commit Failed: %s", str(err))
            return JSONResponse(
                status_code=500,
                content={"message": str(err)},
            )
        finally:
            # Drop the per-job lock entry. A default is required: a delete
            # request that was queued on the same lock may find the entry
            # already removed, and a KeyError raised here would replace the
            # response returned above.
            app.job_locks.pop(job_id, None)
optimizer = ZeusBatchSizeOptimizer(ZeusService(db_session)) + try: + logger.info("Report with result %s", str(result)) + res = await optimizer.report(result) + await db_session.commit() + return res + except ZeusBSOServerBaseError as err: + await db_session.rollback() + return JSONResponse( + status_code=err.status_code, + content={"message": err.message}, + ) + except Exception as err: + await db_session.rollback() + logger.error("Commit Failed: %s", str(err)) + return JSONResponse( + status_code=500, + content={"message": str(err)}, + ) diff --git a/zeus/optimizer/batch_size/server/services/__init__.py b/zeus/optimizer/batch_size/server/services/__init__.py new file mode 100644 index 00000000..f4111cf6 --- /dev/null +++ b/zeus/optimizer/batch_size/server/services/__init__.py @@ -0,0 +1 @@ +"""Service layer on top of repository layer. Provides core methods to interact with database.""" diff --git a/zeus/optimizer/batch_size/server/services/commands.py b/zeus/optimizer/batch_size/server/services/commands.py new file mode 100644 index 00000000..186c7d69 --- /dev/null +++ b/zeus/optimizer/batch_size/server/services/commands.py @@ -0,0 +1,43 @@ +"""Commands on how to use some methods from the `ZeusService`.""" + +from zeus.util.pydantic_v1 import BaseModel +from zeus.optimizer.batch_size.server.batch_size_state.commands import ReadTrial +from zeus.optimizer.batch_size.server.batch_size_state.models import GaussianTsArmState + + +class GetRandomChoices(BaseModel): + """Parameters for getting a random choices. + + Attributes: + job_id: Job Id + choices: List of choices + """ + + job_id: str + choices: list[int] + + +class GetNormal(BaseModel): + """Parameters for getting a random sample from normal distribution. + + Attributes: + job_id: Job id + loc: Mean + scale: Stdev + """ + + job_id: str + loc: float + scale: float + + +class UpdateArm(BaseModel): + """Parameters to update an arm. + + Attributes: + trial: Identifier of trial + updated_arm: Updated state of arm. 
+ """ + + trial: ReadTrial + updated_arm: GaussianTsArmState diff --git a/zeus/optimizer/batch_size/server/services/service.py b/zeus/optimizer/batch_size/server/services/service.py new file mode 100644 index 00000000..879c2f1c --- /dev/null +++ b/zeus/optimizer/batch_size/server/services/service.py @@ -0,0 +1,393 @@ +"""Zeus batch size optimizer service layer.""" + +from __future__ import annotations + +import json +from datetime import datetime +from typing import Any, Tuple + +import numpy as np +from numpy.random import Generator as np_Generator +from sqlalchemy.ext.asyncio.session import AsyncSession +from zeus.optimizer.batch_size.server.batch_size_state.commands import ( + CreateConcurrentTrial, + CreateExplorationTrial, + CreateMabTrial, + CreateTrial, + ReadTrial, + UpdateTrial, +) +from zeus.optimizer.batch_size.server.batch_size_state.models import ( + BatchSizeBase, + ExplorationsPerJob, + GaussianTsArmState, + Trial, + TrialResultsPerBs, +) +from zeus.optimizer.batch_size.server.batch_size_state.repository import ( + BatchSizeStateRepository, +) +from zeus.optimizer.batch_size.server.database.schema import TrialStatus, TrialType +from zeus.optimizer.batch_size.server.exceptions import ( + ZeusBSOServiceBadOperationError, + ZeusBSOValueError, +) +from zeus.optimizer.batch_size.server.job.commands import ( + CreateJob, + UpdateExpDefaultBs, + UpdateGeneratorState, + UpdateJobMinCost, + UpdateJobStage, +) +from zeus.optimizer.batch_size.server.job.models import JobState +from zeus.optimizer.batch_size.server.job.repository import JobStateRepository +from zeus.optimizer.batch_size.server.services.commands import ( + GetNormal, + GetRandomChoices, + UpdateArm, +) +from zeus.util.metric import zeus_cost + + +class ZeusService: + """Zeus Service that interacts with database using repository. + + Provides application layer methods to communicate with database. + Each method is one or more number of db operations that have to be done at the same time. 
+ """ + + def __init__(self, db_session: AsyncSession): + """Set up repositories to use to talk to database.""" + self.bs_repo = BatchSizeStateRepository(db_session) + self.job_repo = JobStateRepository(db_session) + + async def get_arms(self, job_id: str) -> list[GaussianTsArmState]: + """Get GaussianTs arm states for all arms(job_id, batch size). + + Args: + job_id: Job id + + Returns: + list of arms + """ + return await self.bs_repo.get_arms(job_id) + + async def get_arm(self, bs: BatchSizeBase) -> GaussianTsArmState | None: + """Get arm state for one arm. + + Args: + bs: (job_id, batch size) pair that represents one arm + + Returns: + Result arm state or None if we cannot find that arm + """ + return await self.bs_repo.get_arm(bs) + + async def get_explorations_of_job(self, job_id: str) -> ExplorationsPerJob: + """Get all explorations we have done for that job. + + Args: + job_id: Job id + + Returns: + list of explorations per each batch size + """ + return await self.bs_repo.get_explorations_of_job(job_id) + + def update_trial(self, updated_trial: UpdateTrial) -> None: + """Update trial. + + (1) update the corresponding trial. + (2) we update the min training cost observed so far if we have to. + + Args: + updated_trial: Result of training that batch size + + Raises: + [`ZeusBSOServiceBadOperationError`][zeus.optimizer.batch_size.server.exceptions.ZeusBSOServiceBadOperationError]: When we didn't fetch the job or trial during this session. This operation should have + fetched the job and trial first. Also, check if trial type is matching with fetched trial's type. 
+ """ + trial = self._get_trial( + ReadTrial( + job_id=updated_trial.job_id, + batch_size=updated_trial.batch_size, + trial_number=updated_trial.trial_number, + ) + ) + if trial.status != TrialStatus.Dispatched: + raise ZeusBSOServiceBadOperationError("Trial already has a result.") + + self.bs_repo.updated_current_trial(updated_trial) + + if updated_trial.status != TrialStatus.Failed: + job = self._get_job(updated_trial.job_id) + self._update_min_if_needed(updated_trial, job) + + def update_arm_state( + self, + arm: UpdateArm, + ) -> None: + """Update arm state. + + Args: + arm: Updated arm state. + + Raises: + `ZeusBSOServiceBadOperationError`: When we didn't fetch the job or trial during this session. This operation should have + fetched the job and trial first. Also, check if trial type is matching with fetched trial's type. + """ + self._check_job_fetched(arm.trial.job_id) + trial = self._get_trial( + ReadTrial( + job_id=arm.trial.job_id, + batch_size=arm.trial.batch_size, + trial_number=arm.trial.trial_number, + ) + ) + if trial.type != TrialType.MAB: + raise ZeusBSOServiceBadOperationError( + "Cannot update an arm since this trial is not issued from MAB stage." + ) + self.bs_repo.update_arm_state(arm.updated_arm) + + def update_exp_default_bs(self, updated_default_bs: UpdateExpDefaultBs) -> None: + """Update the default batch size for exploration. + + Args: + updated_default_bs: Job Id and new default batch size + + Raises: + `ZeusBSOServiceBadOperationError`: When we didn't fetch the job during this session. This operation should have + fetched the job first. + """ + self._check_job_fetched(updated_default_bs.job_id) + self.job_repo.update_exp_default_bs(updated_default_bs) + + async def create_trial( + self, trial: CreateExplorationTrial | CreateMabTrial | CreateConcurrentTrial + ) -> ReadTrial: + """Create a new trial. + + Args: + trial: New trial to create. + + Raises: + `ZeusBSOServiceBadOperationError`: When we didn't fetch the job during this session. 
This operation should have + fetched the job first. + """ + self._check_job_fetched(trial.job_id) + trial_number = await self.bs_repo.get_next_trial_number(trial.job_id) + self.bs_repo.create_trial( + CreateTrial(**trial.dict(), trial_number=trial_number) + ) + return ReadTrial( + job_id=trial.job_id, batch_size=trial.batch_size, trial_number=trial_number + ) + + def get_random_choices(self, choice: GetRandomChoices) -> np.ndarray[Any, Any]: + """Get randome choices based on job's seed. + + If seed is not None (set by the user) we get the random choices from the generator that is stored in the database. + Otherwise, we get random choices based on random seed. + + Args: + choice: Job id and list of choices + + Returns: + reuslt random choices + + Raises: + `ZeusBSOServiceBadOperationError`: When we didn't fetch the job during this session. This operation should have + fetched the job first. + """ + arr = np.array(choice.choices) + rng, should_update = self._get_generator(choice.job_id) + res = rng.choice(arr, len(arr), replace=False) + + if should_update: + # If we used the generator from database, should update the generator state after using it + self.job_repo.update_generator_state( + UpdateGeneratorState( + job_id=choice.job_id, state=json.dumps(rng.__getstate__()) + ) + ) + + return res + + def get_normal(self, arg: GetNormal) -> float: + """Sample from normal distribution and update the generator state if seed was set. + + Args: + arg: args for `numpy.random.normal`, which is loc(mean of distribution) and scale(stdev of distribution) + + Returns: + Drawn sample. + + Raises: + `ZeusBSOServiceBadOperationError`: When we didn't fetch the job during this session. This operation should have + fetched the job first. 
+ """ + rng, should_update = self._get_generator(arg.job_id) + res = rng.normal(arg.loc, arg.scale) + + if should_update: + # If we used the generator from database, should update the generator state after using it + self.job_repo.update_generator_state( + UpdateGeneratorState( + job_id=arg.job_id, state=json.dumps(rng.__getstate__()) + ) + ) + + return res + + async def get_job(self, job_id: str) -> JobState | None: + """Get job from database. + + Args: + job_id: Job Id + + Returns: + JobState if we found one, None if we couldn't find a job matching the job id. + """ + return await self.job_repo.get_job(job_id) + + async def get_trial(self, trial: ReadTrial) -> Trial | None: + """Get a trial from database. + + Args: + trial: (Job Id, batch size, trial_number) triplet. + + Returns: + Trial if we found one, None if we couldn't find a job matching trial. + """ + return await self.bs_repo.get_trial(trial) + + def create_job(self, new_job: CreateJob) -> None: + """Create a new job. + + Args: + new_job: Configuration of a new job + """ + return self.job_repo.create_job(new_job) + + async def get_trial_results_of_bs(self, bs: BatchSizeBase) -> TrialResultsPerBs: + """Load window size amount of results for a given batch size. If window size <= 0, load all of them. + + Args: + bs: (job_id, batch size) pair. + + Returns: + list of windowed measurements in descending order for that (job_id, batch size) + + Raises: + `ZeusBSOServiceBadOperationError`: When we didn't fetch the job during this session. This operation should have + fetched the job first. + """ + job = self._get_job(bs.job_id) + return await self.bs_repo.get_trial_results_of_bs( + BatchSizeBase(job_id=bs.job_id, batch_size=bs.batch_size), + job.window_size, + ) + + def create_arms(self, new_arms: list[GaussianTsArmState]) -> None: + """Create GuassianTs arms for the job. + + Args: + new_arms: List of new arm states + + Raises: + `ZeusBSOServiceBadOperationError`: When we didn't fetch the job during this session. 
This operation should have + fetched the job first. + """ + if len(new_arms) != 0: + self._check_job_fetched(new_arms[0].job_id) + self.bs_repo.create_arms(new_arms) + + def update_job_stage(self, updated_stage: UpdateJobStage) -> None: + """Update the job stage (Pruning -> MAB). + + Args: + updated_stage: Updated stage. + + Raises: + `ZeusBSOServiceBadOperationError`: When we didn't fetch the job during this session. This operation should have + fetched the job first. + """ + self._check_job_fetched(updated_stage.job_id) + self.job_repo.update_stage(updated_stage) + + async def delete_job(self, job_id: str) -> bool: + """Delete the job. + + Args: + job_id: ID of the job. + + Returns: + True if the job is deleted. False if none was deleted + """ + return await self.job_repo.delete_job(job_id) + + def _update_min_if_needed( + self, + updated_trial: UpdateTrial, + job: JobState, + ): + """Update the min training cost and corresponding batch size based on the trianing result.""" + cur_cost = zeus_cost( + updated_trial.energy, updated_trial.time, job.eta_knob, job.max_power + ) + if job.min_cost is None or job.min_cost > cur_cost: + self.job_repo.update_min( + UpdateJobMinCost( + job_id=job.job_id, + min_cost=cur_cost, + min_cost_batch_size=updated_trial.batch_size, + ) + ) + + def _get_generator(self, job_id: str) -> Tuple[np_Generator, bool]: + """Get generator based on job_id. If mab_seed is not none, we should update the state after using generator. + + Returns: + Tuple of [Generator, if we should update state] + """ + job_state = self._get_job(job_id) + + rng = np.random.default_rng(int(datetime.now().timestamp())) + + should_update = job_state.mab_seed is not None + if job_state.mab_seed is not None: + if job_state.mab_random_generator_state is None: + raise ZeusBSOValueError( + "Seed is set but generator state is none. 
Should be impossible" + ) + + state = json.loads(job_state.mab_random_generator_state) + rng.__setstate__(state) + + return (rng, should_update) + + def _get_job(self, job_id: str) -> JobState: + """Get the job from the session. If we couldn't find the job, raise a `ZeusBSOServiceBadOperationError`.""" + res = self.job_repo.get_job_from_session(job_id) + if res is None: + raise ZeusBSOServiceBadOperationError( + f"Should have fetched the job first or job does not exist(job_id = {job_id})" + ) + return res + + def _get_trial(self, trial: ReadTrial) -> Trial: + """Get the trial from the session. If we couldn't find the trial, raise a `ZeusBSOServiceBadOperationError`.""" + res = self.bs_repo.get_trial_from_session(trial) + if res is None: + raise ZeusBSOServiceBadOperationError( + f"Should have fetched the trial first or trial does not exist(trial = {trial})" + ) + return res + + def _check_job_fetched(self, job_id: str) -> None: + """Check if we fetched the job in the current session. If we didn't, raise a `ZeusBSOServiceBadOperationError`.""" + if not self.job_repo.check_job_fetched(job_id): + raise ZeusBSOServiceBadOperationError( + f"check_job_fetched: {job_id} is not currently in the session" + ) diff --git a/zeus/run/master.py b/zeus/run/master.py index 719b92b4..bdf8a0db 100644 --- a/zeus/run/master.py +++ b/zeus/run/master.py @@ -29,7 +29,7 @@ from zeus.analyze import HistoryEntry from zeus.job import Job -from zeus.policy import BatchSizeOptimizer +from zeus._legacy.policy import BatchSizeOptimizer from zeus.util import zeus_cost from zeus.device import get_gpus @@ -44,7 +44,7 @@ class ZeusMaster: [`ZeusDataLoader`][zeus.run.ZeusDataLoader]'s class docstring. The optimal batch size is searched for and exploited using the - [`BatchSizeOptimizer`][zeus.policy.BatchSizeOptimizer] object passed in + [`BatchSizeOptimizer`][zeus._legacy.policy.BatchSizeOptimizer] object passed in through the constructor.
""" @@ -62,7 +62,7 @@ def __init__( Args: batch_size_optimizer: The user is expected to construct the - [`BatchSizeOptimizer`][zeus.policy.BatchSizeOptimizer] with the desired + [`BatchSizeOptimizer`][zeus._legacy.policy.BatchSizeOptimizer] with the desired policy and pass it into the master class. log_base: Absolute path where logs will be stored. A separate directory will be created inside, whose name is determined by the job and current time.