From 57c077d2e8c46ad9e867e4960b6e117a6c11acee Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 19 Dec 2022 14:00:30 +0200 Subject: [PATCH 01/67] workflow added --- .circleci/config.yml | 28 ++++++++++++++++++- tests/recipe_training_tests/__init__.py | 0 .../cifar10_recipe_sanity_test.py | 20 +++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 tests/recipe_training_tests/__init__.py create mode 100644 tests/recipe_training_tests/cifar10_recipe_sanity_test.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 8cca854720..7b3d2fe2d8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -104,7 +104,6 @@ jobs: - store_artifacts: path: ~/sg_logs - release_candidate: parameters: py_version: @@ -180,6 +179,30 @@ jobs: tag: $CIRCLE_TAG notes: "This GitHub Release was done automatically by CircleCI" + recipe_tests: + machine: true + resource_class: deci-ai/sg-gpu-on-premise + parameters: + sg_existing_env_path: + type: string + default: "/env/persistent_env" + sg_new_env_name: + type: string + default: "${CIRCLE_BUILD_NUM}" + sg_new_env_python_version: + type: string + default: "python3.8" + steps: + - checkout + - run: + name: install requirements + command: | + << parameters.sg_new_env_python_version >> -m venv << parameters.sg_new_env_name >> + source << parameters.sg_new_env_name >>/bin/activate + python --version + python -m pip install --upgrade setuptools + python -m pip install wheel + python -m pip install -r requirements.txt workflows: @@ -220,6 +243,8 @@ workflows: branch: << pipeline.git.branch >> - deci-common/codeartifact_login: repo_name: "deci-packages" + + - recipe_tests - build: name: "build3.7" py_version: "3.7" @@ -227,6 +252,7 @@ workflows: requires: - deci-common/persist_version_info - deci-common/codeartifact_login + - release_candidate: # happens on merge py_version: "3.7" requires: diff --git a/tests/recipe_training_tests/__init__.py b/tests/recipe_training_tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/recipe_training_tests/cifar10_recipe_sanity_test.py b/tests/recipe_training_tests/cifar10_recipe_sanity_test.py new file mode 100644 index 0000000000..2c3e919fed --- /dev/null +++ b/tests/recipe_training_tests/cifar10_recipe_sanity_test.py @@ -0,0 +1,20 @@ +from omegaconf import DictConfig +import hydra +import pkg_resources + +from super_gradients import Trainer, init_trainer + + +@hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), config_name="cifar10_resnet", version_base="1.2") +def main(cfg: DictConfig) -> None: + cfg["training_hyperparams"]["max_epochs"] = 10 + Trainer.train_from_config(cfg) + + +def run(): + init_trainer() + main() + + +if __name__ == "__main__": + run() From 077e43de0a959e708ff4de40c78513fb99ea6549 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 19 Dec 2022 14:03:51 +0200 Subject: [PATCH 02/67] first tests added --- .circleci/config.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7b3d2fe2d8..b323c27e9c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -205,6 +205,11 @@ jobs: python -m pip install -r requirements.txt + + - run: + name: cifar_10_sanity + command: | + python -m tests/recipe_training_tests/cifar10_recipe_sanity_test.py workflows: release: jobs: From 0ef726a3ccd161f3934efd2691cf03f45f405d91 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 19 Dec 2022 14:19:11 +0200 Subject: [PATCH 03/67] sanity tests moved --- .circleci/config.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b323c27e9c..a8846edd13 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -195,7 +195,7 @@ jobs: steps: - checkout - run: - name: install requirements + name: install requirements and run recipe tests command: | << parameters.sg_new_env_python_version >> -m venv << parameters.sg_new_env_name >> source << parameters.sg_new_env_name >>/bin/activate @@ -203,13 +203,9 @@ jobs: python -m pip install --upgrade setuptools python -m pip install wheel python -m pip install -r requirements.txt + python -m tests/recipe_training_tests/cifar10_recipe_sanity_test.py - - - run: - name: cifar_10_sanity - command: | - python -m tests/recipe_training_tests/cifar10_recipe_sanity_test.py workflows: release: jobs: From 7f5bf046ebdeee7d89ece23802ae3e451ffbf499 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 19 Dec 2022 14:24:43 +0200 Subject: [PATCH 04/67] -m removed --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a8846edd13..dfcf7b0e39 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python -m pip install --upgrade setuptools python -m pip install wheel python -m pip install -r requirements.txt - python -m tests/recipe_training_tests/cifar10_recipe_sanity_test.py + python tests/recipe_training_tests/cifar10_recipe_sanity_test.py workflows: From bd0dc11b2a9051bad43055b996d25dff34cf6b3e Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 19 Dec 2022 14:30:49 +0200 Subject: [PATCH 05/67] env var added --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index dfcf7b0e39..defdb54c9b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,9 +203,9 @@ jobs: python -m pip install --upgrade setuptools python -m pip install wheel python -m pip install -r requirements.txt + export PYTHONPATH=/home/circleci/super_gradients/src:/home/circleci/super_gradients/ python tests/recipe_training_tests/cifar10_recipe_sanity_test.py - workflows: release: jobs: From 5e930d12f35ff7eb1d471b7c6ce37488c7353193 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 19 Dec 2022 14:50:04 +0200 Subject: [PATCH 06/67] installation from branch added --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index defdb54c9b..11a2ab421f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python -m pip install --upgrade setuptools python -m pip install wheel python -m pip install -r requirements.txt - export PYTHONPATH=/home/circleci/super_gradients/src:/home/circleci/super_gradients/ + pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python tests/recipe_training_tests/cifar10_recipe_sanity_test.py workflows: From 933ab7928d61f9849bfdf86eef5e9d3be61fbe68 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 19 Dec 2022 17:27:44 +0200 Subject: [PATCH 07/67] more changes --- .circleci/config.yml | 4 +++- .../cifar10_recipe_sanity_test.py | 1 - .../recipe_sanity_unit_test.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 tests/recipe_training_tests/recipe_sanity_unit_test.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 11a2ab421f..b9a9e45b26 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -204,7 +204,9 @@ jobs: python -m pip install wheel python -m pip install -r requirements.txt pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - python tests/recipe_training_tests/cifar10_recipe_sanity_test.py + python src/super_gradients/examples/train_from_recipe_example --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 + python -m unittest /tests/recipe_training_tests/recipe_sanity_unit_test.py + workflows: release: diff --git a/tests/recipe_training_tests/cifar10_recipe_sanity_test.py b/tests/recipe_training_tests/cifar10_recipe_sanity_test.py index 2c3e919fed..5b93d94951 100644 --- a/tests/recipe_training_tests/cifar10_recipe_sanity_test.py +++ b/tests/recipe_training_tests/cifar10_recipe_sanity_test.py @@ -7,7 +7,6 @@ @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), config_name="cifar10_resnet", version_base="1.2") def main(cfg: DictConfig) -> None: - cfg["training_hyperparams"]["max_epochs"] = 10 Trainer.train_from_config(cfg) diff --git a/tests/recipe_training_tests/recipe_sanity_unit_test.py b/tests/recipe_training_tests/recipe_sanity_unit_test.py new file mode 100644 index 0000000000..4f412a4a2c --- /dev/null +++ b/tests/recipe_training_tests/recipe_sanity_unit_test.py @@ -0,0 +1,18 @@ +import shutil +import unittest +import os + +import torch +from super_gradients.common.environment import environment_config + + +class Cifar10RecipeSanityUnitTest(unittest.TestCase): + def test_cifar10_resnet_metric(self): + ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, "cifar10_resnet_sanity") + sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth")) + shutil.rmtree(ckpt_dir) + self.assertTrue(sd["acc"].cpu().item() >= 0.75) + + +if __name__ == "__main__": + unittest.main() From a195d97635c481b7b5ac17f9b940172ba9178204 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 19 Dec 2022 17:32:08 +0200 Subject: [PATCH 08/67] command fix --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b9a9e45b26..917fd127b5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -204,7 +204,7 @@ jobs: python -m pip install wheel python -m pip install -r requirements.txt pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - python src/super_gradients/examples/train_from_recipe_example --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 + python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 python -m unittest /tests/recipe_training_tests/recipe_sanity_unit_test.py From 5829e10d52b395ae7571ee2893f003907ff89929 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 20 Dec 2022 13:36:39 +0200 Subject: [PATCH 09/67] formatt --- .circleci/config.yml | 3 +- .../recipes/cifar10_resnet.yaml | 3 +- .../cifar10_recipe_sanity_test.py | 19 ----------- .../recipe_sanity_test.py | 32 +++++++++++++++++++ .../recipe_sanity_unit_test.py | 18 ----------- 5 files changed, 35 insertions(+), 40 deletions(-) delete mode 100644 tests/recipe_training_tests/cifar10_recipe_sanity_test.py create mode 100644 tests/recipe_training_tests/recipe_sanity_test.py delete mode 100644 tests/recipe_training_tests/recipe_sanity_unit_test.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 917fd127b5..601ce61e3d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -204,8 +204,7 @@ jobs: python -m pip install wheel python -m pip install -r requirements.txt pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 - python -m unittest /tests/recipe_training_tests/recipe_sanity_unit_test.py + python /tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 +goal_metric_val=0.9 workflows: diff --git a/src/super_gradients/recipes/cifar10_resnet.yaml b/src/super_gradients/recipes/cifar10_resnet.yaml index 9b18e916a0..89f99cf108 100644 --- a/src/super_gradients/recipes/cifar10_resnet.yaml +++ b/src/super_gradients/recipes/cifar10_resnet.yaml @@ -24,7 +24,8 @@ resume: False training_hyperparams: resume: ${resume} - +multi_gpu: DDP +num_gpus: 4 ckpt_root_dir: architecture: resnet18_cifar diff --git a/tests/recipe_training_tests/cifar10_recipe_sanity_test.py b/tests/recipe_training_tests/cifar10_recipe_sanity_test.py deleted file mode 100644 index 5b93d94951..0000000000 --- a/tests/recipe_training_tests/cifar10_recipe_sanity_test.py +++ /dev/null @@ -1,19 +0,0 @@ -from omegaconf import DictConfig -import hydra -import pkg_resources - -from super_gradients import Trainer, init_trainer - - -@hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), config_name="cifar10_resnet", version_base="1.2") -def main(cfg: DictConfig) -> None: - Trainer.train_from_config(cfg) - - -def run(): - init_trainer() - main() - - -if __name__ == "__main__": - run() diff --git a/tests/recipe_training_tests/recipe_sanity_test.py b/tests/recipe_training_tests/recipe_sanity_test.py new file mode 100644 index 0000000000..af87529852 --- /dev/null +++ b/tests/recipe_training_tests/recipe_sanity_test.py @@ -0,0 +1,32 @@ +from coverage.annotate import os +from omegaconf import DictConfig +import hydra +import pkg_resources +from super_gradients.common.environment import environment_config +import torch +from super_gradients import Trainer, init_trainer +from super_gradients.common.environment.ddp_utils import multi_process_safe + + +@multi_process_safe +def _assert_recipe_metric(experiment_name: str, metric_value: float): + ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, experiment_name) + sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth")) + assert sd["acc"].cpu().item() >= metric_value + + +@hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), config_name="cifar10_resnet", version_base="1.2") +def main(cfg: DictConfig) -> None: + goal_metric_val = cfg["goal_metric_val"] + experiment_name = cfg["experiment_name"] + Trainer.train_from_config(cfg) + _assert_recipe_metric(experiment_name, goal_metric_val) + + +def run(): + init_trainer() + main() + + +if __name__ == "__main__": + run() diff --git a/tests/recipe_training_tests/recipe_sanity_unit_test.py b/tests/recipe_training_tests/recipe_sanity_unit_test.py deleted file mode 100644 index 4f412a4a2c..0000000000 --- a/tests/recipe_training_tests/recipe_sanity_unit_test.py +++ /dev/null @@ -1,18 +0,0 @@ -import shutil -import unittest -import os - -import torch -from super_gradients.common.environment import environment_config - - -class Cifar10RecipeSanityUnitTest(unittest.TestCase): - def test_cifar10_resnet_metric(self): - ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, "cifar10_resnet_sanity") - sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth")) - shutil.rmtree(ckpt_dir) - self.assertTrue(sd["acc"].cpu().item() >= 0.75) - - -if __name__ == "__main__": - unittest.main() From 9b065d750e5af95c9f3923cc815c467075736f1e Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 20 Dec 2022 13:45:32 +0200 Subject: [PATCH 10/67] remove env adde to recipe+tests --- .circleci/config.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 601ce61e3d..08bbeab981 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -206,6 +206,11 @@ jobs: pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python /tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 +goal_metric_val=0.9 + - run: + name: Remove new environment when failed + command: "rm -r << parameters.sg_new_env_name >>" + when: on_fail + workflows: release: From 6183ee33b2d67cd342e9220996625f16d279fbda Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 20 Dec 2022 14:01:04 +0200 Subject: [PATCH 11/67] command fix in config --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 08bbeab981..6ddbb0733a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -204,7 +204,7 @@ jobs: python -m pip install wheel python -m pip install -r requirements.txt pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - python /tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 +goal_metric_val=0.9 + python tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 +goal_metric_val=0.9 - run: name: Remove new environment when failed From 567dcdfc17da743c2b29105cc8e3cffab6d3017d Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 20 Dec 2022 14:20:01 +0200 Subject: [PATCH 12/67] torchrun instead of python --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6ddbb0733a..b1eb975c41 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -204,7 +204,7 @@ jobs: python -m pip install wheel python -m pip install -r requirements.txt pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - python tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 +goal_metric_val=0.9 + torchrun --nproc_per_node=4 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 +goal_metric_val=0.9 - run: name: Remove new environment when failed From 35a1b3828ca6e110f7f4a9f96993ce8ebfc35ab2 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 20 Dec 2022 14:38:41 +0200 Subject: [PATCH 13/67] command update --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b1eb975c41..6ddbb0733a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -204,7 +204,7 @@ jobs: python -m pip install wheel python -m pip install -r requirements.txt pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - torchrun --nproc_per_node=4 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 +goal_metric_val=0.9 + python tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 +goal_metric_val=0.9 - run: name: Remove new environment when failed From 6d4ca534abca5e6230453410ef9ec7f599cd9c49 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 20 Dec 2022 14:51:12 +0200 Subject: [PATCH 14/67] hydra full error env var --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6ddbb0733a..692e33ba49 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -204,6 +204,7 @@ jobs: python -m pip install wheel python -m pip install -r requirements.txt pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} + export HYDRA_FULL_ERROR=1 python tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 +goal_metric_val=0.9 - run: From 6e7a7f01a7e0979ba9cf549a171faa4a6e71685a Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 20 Dec 2022 14:56:59 +0200 Subject: [PATCH 15/67] train from recipe cmd --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 692e33ba49..644bb0593f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -205,7 +205,7 @@ jobs: python -m pip install -r requirements.txt pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} export HYDRA_FULL_ERROR=1 - python tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 +goal_metric_val=0.9 + python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 - run: name: Remove new environment when failed From d08430982baa595562f283331021d67fcc053e10 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 20 Dec 2022 15:08:07 +0200 Subject: [PATCH 16/67] torch installation fix --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 644bb0593f..c5d3691795 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -204,7 +204,7 @@ jobs: python -m pip install wheel python -m pip install -r requirements.txt pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - export HYDRA_FULL_ERROR=1 + pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 - run: From e06eabee0e1153f6d30d26a20d9cde6278f13695 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Thu, 22 Dec 2022 10:52:46 +0200 Subject: [PATCH 17/67] protobuf version try --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 92f2852605..1c5b1ebcf8 100755 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ pip-tools>=6.4.0 pyparsing==2.4.5 einops==0.3.2 pycocotools==2.0.4 -protobuf~=3.19.0 +protobuf==3.20.3 treelib==1.6.1 termcolor==1.1.0 packaging>=20.4 From f5c6a1145ab30e714f163b407b4ec35ccc0d911e Mon Sep 17 00:00:00 2001 From: shayaharon Date: Thu, 22 Dec 2022 15:37:30 +0200 Subject: [PATCH 18/67] lets get this running --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1c5b1ebcf8..92f2852605 100755 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ pip-tools>=6.4.0 pyparsing==2.4.5 einops==0.3.2 pycocotools==2.0.4 -protobuf==3.20.3 +protobuf~=3.19.0 treelib==1.6.1 termcolor==1.1.0 packaging>=20.4 From 1ffd2bb81ac18bf828778803d43665c1c01a5ed4 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Thu, 22 Dec 2022 15:54:30 +0200 Subject: [PATCH 19/67] lets get this working --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c5d3691795..7389d2732f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -204,7 +204,6 @@ jobs: python -m pip install wheel python -m pip install -r requirements.txt pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 - run: From 422de1f786e2f5f40b4044270771de8e706ee1cd Mon Sep 17 00:00:00 2001 From: shayaharon Date: Thu, 22 Dec 2022 16:04:53 +0200 Subject: [PATCH 20/67] let make this work2 --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7389d2732f..516103a69d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,6 +203,7 @@ jobs: python -m pip install --upgrade setuptools python -m pip install wheel python -m pip install -r requirements.txt + pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 From 27f866bc99e8b5ecffdce058fa1c16769045744f Mon Sep 17 00:00:00 2001 From: shayaharon Date: Thu, 22 Dec 2022 16:13:11 +0200 Subject: [PATCH 21/67] lets make this work 3.0 --- .circleci/config.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 516103a69d..14f390981e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -199,13 +199,11 @@ jobs: command: | << parameters.sg_new_env_python_version >> -m venv << parameters.sg_new_env_name >> source << parameters.sg_new_env_name >>/bin/activate - python --version - python -m pip install --upgrade setuptools - python -m pip install wheel - python -m pip install -r requirements.txt - pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 - pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 + python3.8 -m pip install --upgrade setuptools pip wheel + python3.8 -m pip install -r requirements.txt + python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} + python3.8 -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 - run: name: Remove new environment when failed From 6adf90f54a3d0cb112d6fba9393d01a5861441b9 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 10:04:41 +0200 Subject: [PATCH 22/67] let make this work 4.0 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 14f390981e..ba9ee1f555 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=2 - run: name: Remove new environment when failed From ea7c9a5621044883e55b58dd245feb707d13ee60 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 11:45:43 +0200 Subject: [PATCH 23/67] lets make this work 5.0 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ba9ee1f555..14f390981e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=2 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 - run: name: Remove new environment when failed From f63bd6070740cad0d33e2918de847ac735b6436a Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 12:43:59 +0200 Subject: [PATCH 24/67] coco try --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 14f390981e..800da4c988 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ssd_lite_mobilenet_v2 anchors_name=stride_16_plus experiment_name=coco_resnet_sanity training_hyperparams.max_epochs=2 - run: name: Remove new environment when failed From 04d952849adc1c34b0fa2937288a71efea42d706 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 12:49:06 +0200 Subject: [PATCH 25/67] coco try yolox --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 800da4c988..8c5d33c7b5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ssd_lite_mobilenet_v2 anchors_name=stride_16_plus experiment_name=coco_resnet_sanity training_hyperparams.max_epochs=2 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_n experiment_name=coco_resnet_sanity training_hyperparams.max_epochs=2 - run: name: Remove new environment when failed From 38cd95fb42ded8aceb06d0c8e4b6227bab1353ca Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 12:54:25 +0200 Subject: [PATCH 26/67] coco try yolox fix num gpus --- src/super_gradients/recipes/coco2017_yolox.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/recipes/coco2017_yolox.yaml b/src/super_gradients/recipes/coco2017_yolox.yaml index b520bdf0ed..3d4b9abdad 100644 --- a/src/super_gradients/recipes/coco2017_yolox.yaml +++ b/src/super_gradients/recipes/coco2017_yolox.yaml @@ -51,7 +51,7 @@ training_hyperparams: architecture: yolox_s multi_gpu: DDP -num_gpus: 8 +num_gpus: 4 experiment_suffix: res${dataset_params.train_dataset_params.input_dim} experiment_name: ${architecture}_coco2017_${experiment_suffix} From 92931e76eb063ebfc1c28750c801064bedda45e4 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 13:21:33 +0200 Subject: [PATCH 27/67] reordr installs --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8c5d33c7b5..721fbe9172 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -200,10 +200,10 @@ jobs: << parameters.sg_new_env_python_version >> -m venv << parameters.sg_new_env_name >> source << parameters.sg_new_env_name >>/bin/activate python3.8 -m pip install --upgrade setuptools pip wheel + python3.8 -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - python3.8 -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_n experiment_name=coco_resnet_sanity training_hyperparams.max_epochs=2 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 - run: name: Remove new environment when failed From 4e2fca251e5242951c00fd280c1504d2d619a89f Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 13:31:58 +0200 Subject: [PATCH 28/67] order installs + python3.8 removed --- .circleci/config.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 721fbe9172..6197c2c8f6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -199,11 +199,13 @@ jobs: command: | << parameters.sg_new_env_python_version >> -m venv << parameters.sg_new_env_name >> source << parameters.sg_new_env_name >>/bin/activate - python3.8 -m pip install --upgrade setuptools pip wheel - python3.8 -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 - python3.8 -m pip install -r requirements.txt - python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 + python --version + python -m pip install --upgrade setuptools + python -m pip install wheel + python -m pip install -r requirements.txt + python pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} + python pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 - run: name: Remove new environment when failed From ee3f7bd742df48ee1c4e7567491b2e65f5066b84 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 13:39:03 +0200 Subject: [PATCH 29/67] order installs + python3.8 --- .circleci/config.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6197c2c8f6..14f390981e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -199,13 +199,11 @@ jobs: command: | << parameters.sg_new_env_python_version >> -m venv << parameters.sg_new_env_name >> source << parameters.sg_new_env_name >>/bin/activate - python --version - python -m pip install --upgrade setuptools - python -m pip install wheel - python -m pip install -r requirements.txt - python pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - python pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 - python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 + python3.8 -m pip install --upgrade setuptools pip wheel + python3.8 -m pip install -r requirements.txt + python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} + python3.8 -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 - run: name: Remove new environment when failed From 9b79d9c56a783754d379a577b5adbf38bcc5b16d Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 14:25:24 +0200 Subject: [PATCH 30/67] torch 1.12 --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 14f390981e..69569ac5f8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -202,7 +202,7 @@ jobs: python3.8 -m pip install --upgrade setuptools pip wheel python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - python3.8 -m pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + python3.8 -m pip install torch==1.12.0+cu113 torchvision==0.13.0+cu113 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu113 python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 - run: @@ -211,6 +211,7 @@ jobs: when: on_fail + workflows: release: jobs: From 4db4ff253cc4c469f42ecfeafb4b6e3d6e5e3b11 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 14:32:06 +0200 Subject: [PATCH 31/67] linter --- .../training/dataloaders/dataloaders.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/super_gradients/training/dataloaders/dataloaders.py b/src/super_gradients/training/dataloaders/dataloaders.py index 2c8dd5512f..cd345de6d1 100644 --- a/src/super_gradients/training/dataloaders/dataloaders.py +++ b/src/super_gradients/training/dataloaders/dataloaders.py @@ -32,10 +32,7 @@ SuperviselyPersonsDataset, ) from super_gradients.common.factories.samplers_factory import SamplersFactory -from super_gradients.training.utils.distributed_training_utils import ( - wait_for_the_master, - get_local_rank, -) + from super_gradients.common.abstractions.abstract_logger import get_logger from super_gradients.training.utils.utils import override_default_params_without_nones from super_gradients.common.factories.datasets_factory import DatasetsFactory @@ -72,11 +69,9 @@ def get_data_loader(config_name, dataset_cls, train, dataset_params=None, datalo dataset_params = _process_dataset_params(cfg, dataset_params, train) - local_rank = get_local_rank() - with wait_for_the_master(local_rank): - dataset = dataset_cls(**dataset_params) - if not hasattr(dataset, "dataset_params"): - dataset.dataset_params = dataset_params + dataset = dataset_cls(**dataset_params) + if not hasattr(dataset, "dataset_params"): + dataset.dataset_params = dataset_params dataloader_params = _process_dataloader_params(cfg, dataloader_params, dataset, train) From 6f7384c6e26346a08d6720842d0e91afd0895c0a Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 16:28:37 +0200 Subject: [PATCH 32/67] cleanup and 11.6 --- .circleci/config.yml | 2 +- src/super_gradients/recipes/coco2017_yolox.yaml | 2 +- .../training/dataloaders/dataloaders.py | 13 +++++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 69569ac5f8..147c8d71d4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -202,7 +202,7 @@ jobs: python3.8 -m pip install --upgrade setuptools pip wheel python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} - python3.8 -m pip install torch==1.12.0+cu113 torchvision==0.13.0+cu113 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu113 + python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 - run: diff --git a/src/super_gradients/recipes/coco2017_yolox.yaml b/src/super_gradients/recipes/coco2017_yolox.yaml index 3d4b9abdad..b520bdf0ed 100644 --- a/src/super_gradients/recipes/coco2017_yolox.yaml +++ b/src/super_gradients/recipes/coco2017_yolox.yaml @@ -51,7 +51,7 @@ training_hyperparams: architecture: yolox_s multi_gpu: DDP -num_gpus: 4 +num_gpus: 8 experiment_suffix: res${dataset_params.train_dataset_params.input_dim} experiment_name: ${architecture}_coco2017_${experiment_suffix} diff --git a/src/super_gradients/training/dataloaders/dataloaders.py b/src/super_gradients/training/dataloaders/dataloaders.py index cd345de6d1..2c8dd5512f 100644 --- a/src/super_gradients/training/dataloaders/dataloaders.py +++ b/src/super_gradients/training/dataloaders/dataloaders.py @@ -32,7 +32,10 @@ SuperviselyPersonsDataset, ) from super_gradients.common.factories.samplers_factory import SamplersFactory - +from super_gradients.training.utils.distributed_training_utils import ( + wait_for_the_master, + get_local_rank, +) from super_gradients.common.abstractions.abstract_logger import get_logger from super_gradients.training.utils.utils import override_default_params_without_nones from super_gradients.common.factories.datasets_factory import DatasetsFactory @@ -69,9 +72,11 @@ def get_data_loader(config_name, dataset_cls, train, dataset_params=None, datalo dataset_params = _process_dataset_params(cfg, dataset_params, train) - dataset = dataset_cls(**dataset_params) - if not hasattr(dataset, "dataset_params"): - dataset.dataset_params = dataset_params + local_rank = get_local_rank() + with wait_for_the_master(local_rank): + dataset = dataset_cls(**dataset_params) + if not hasattr(dataset, "dataset_params"): + dataset.dataset_params = dataset_params dataloader_params = _process_dataloader_params(cfg, dataloader_params, dataset, train) From c39a9c1144f5852382d5d0c2144ae00251279382 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 16:42:41 +0200 Subject: [PATCH 33/67] 11.6 with 2 epochs --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 147c8d71d4..9b187e3544 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=3 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=2 - run: name: Remove new environment when failed From 9cf0e096c3be6fc1d2c8cd21f1512c0e9e5795d9 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 18:42:53 +0200 Subject: [PATCH 34/67] dist launch used --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9b187e3544..3fe909a7ec 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=2 + python3.8 -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=2 - run: name: Remove new environment when failed From b6b0eab44ec6cbedc401628e66690fa3732ff255 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 18:48:47 +0200 Subject: [PATCH 35/67] dataset params lines removed --- src/super_gradients/training/dataloaders/dataloaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/super_gradients/training/dataloaders/dataloaders.py b/src/super_gradients/training/dataloaders/dataloaders.py index 2c8dd5512f..b0a0fdfe01 100644 --- a/src/super_gradients/training/dataloaders/dataloaders.py +++ b/src/super_gradients/training/dataloaders/dataloaders.py @@ -75,8 +75,8 @@ def get_data_loader(config_name, dataset_cls, train, dataset_params=None, datalo local_rank = get_local_rank() with wait_for_the_master(local_rank): dataset = dataset_cls(**dataset_params) - if not hasattr(dataset, "dataset_params"): - dataset.dataset_params = dataset_params + # if not hasattr(dataset, "dataset_params"): + # dataset.dataset_params = dataset_params dataloader_params = _process_dataloader_params(cfg, dataloader_params, dataset, train) From d35cc2f7831df9979914d19f24eb080bc89ba396 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 25 Dec 2022 21:26:09 +0200 Subject: [PATCH 36/67] nccl debug --- .circleci/config.yml | 1 + src/super_gradients/training/dataloaders/dataloaders.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3fe909a7ec..9dc0123777 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,6 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 + export NCCL_DEBUG=INFO python3.8 -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=2 - run: diff --git a/src/super_gradients/training/dataloaders/dataloaders.py b/src/super_gradients/training/dataloaders/dataloaders.py index b0a0fdfe01..2c8dd5512f 100644 --- a/src/super_gradients/training/dataloaders/dataloaders.py +++ b/src/super_gradients/training/dataloaders/dataloaders.py @@ -75,8 +75,8 @@ def get_data_loader(config_name, dataset_cls, train, dataset_params=None, datalo local_rank = get_local_rank() with wait_for_the_master(local_rank): dataset = dataset_cls(**dataset_params) - # if not hasattr(dataset, "dataset_params"): - # dataset.dataset_params = dataset_params + if not hasattr(dataset, "dataset_params"): + dataset.dataset_params = dataset_params dataloader_params = _process_dataloader_params(cfg, dataloader_params, dataset, train) From 56ccad18354c9e984223d38b94e0bd6eef01806e Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 26 Dec 2022 11:28:11 +0200 Subject: [PATCH 37/67] assert with abs, cifar rolled back --- .circleci/config.yml | 4 ++-- tests/recipe_training_tests/recipe_sanity_test.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9dc0123777..cc8e832e27 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,8 +203,8 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - export NCCL_DEBUG=INFO - python3.8 -m torch.distributed.launch --nproc_per_node=4 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_sanity training_hyperparams.max_epochs=2 + python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet training_hyperparams.max_epochs=20 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 +goal_metric_val=0.813 +delta=0.02 + - run: name: Remove new environment when failed diff --git a/tests/recipe_training_tests/recipe_sanity_test.py b/tests/recipe_training_tests/recipe_sanity_test.py index af87529852..7aec89999b 100644 --- a/tests/recipe_training_tests/recipe_sanity_test.py +++ b/tests/recipe_training_tests/recipe_sanity_test.py @@ -9,18 +9,19 @@ @multi_process_safe -def _assert_recipe_metric(experiment_name: str, metric_value: float): +def _assert_recipe_metric(experiment_name: str, metric_value: float, delta: float): ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, experiment_name) sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth")) - assert sd["acc"].cpu().item() >= metric_value + assert abs(sd["acc"].cpu().item() - metric_value) <= delta @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), config_name="cifar10_resnet", version_base="1.2") def main(cfg: DictConfig) -> None: goal_metric_val = cfg["goal_metric_val"] experiment_name = cfg["experiment_name"] + delta = cfg["delta"] Trainer.train_from_config(cfg) - _assert_recipe_metric(experiment_name, goal_metric_val) + _assert_recipe_metric(experiment_name, goal_metric_val, delta) def run(): From a8a8e2b64d844d8b36668d143e403fc57eaec939 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 26 Dec 2022 11:28:31 +0200 Subject: [PATCH 38/67] cifar recipe fix --- src/super_gradients/recipes/cifar10_resnet.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/super_gradients/recipes/cifar10_resnet.yaml b/src/super_gradients/recipes/cifar10_resnet.yaml index 89f99cf108..7954046c8d 100644 --- a/src/super_gradients/recipes/cifar10_resnet.yaml +++ b/src/super_gradients/recipes/cifar10_resnet.yaml @@ -24,8 +24,6 @@ resume: False training_hyperparams: resume: ${resume} -multi_gpu: DDP -num_gpus: 4 ckpt_root_dir: architecture: resnet18_cifar From be0acec4e69a524dc70e81785382c2f39af2c96d Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 26 Dec 2022 11:58:05 +0200 Subject: [PATCH 39/67] formatting --- src/super_gradients/training/sg_trainer/sg_trainer.py | 8 +++++++- tests/recipe_training_tests/recipe_sanity_test.py | 10 +++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index a17ca34fe5..2299e223d3 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -954,7 +954,13 @@ def forward(self, inputs, targets): training_params = dict() self.train_loader = train_loader or self.train_loader self.valid_loader = valid_loader or self.valid_loader - if len(self.train_loader.dataset) % self.train_loader.batch_size != 0 and not self.train_loader.drop_last: + + if hasattr(self.train_loader, "batch_sampler") and self.train_loader.batch_sampler is not None: + batch_size = self.train_loader.batch_sampler.batch_size + else: + batch_size = self.train_loader.batch_size + + if len(self.train_loader.dataset) % batch_size != 0 and not self.train_loader.drop_last: logger.warning("Train dataset size % batch_size != 0 and drop_last=False, this might result in smaller " "last batch.") self._set_dataset_params() diff --git a/tests/recipe_training_tests/recipe_sanity_test.py b/tests/recipe_training_tests/recipe_sanity_test.py index 7aec89999b..13f1e18be3 100644 --- a/tests/recipe_training_tests/recipe_sanity_test.py +++ b/tests/recipe_training_tests/recipe_sanity_test.py @@ -6,13 +6,21 @@ import torch from super_gradients import Trainer, init_trainer from super_gradients.common.environment.ddp_utils import multi_process_safe +from super_gradients.common.abstractions.abstract_logger import get_logger + +logger = get_logger(__name__) @multi_process_safe def _assert_recipe_metric(experiment_name: str, metric_value: float, delta: float): ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, experiment_name) sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth")) - assert abs(sd["acc"].cpu().item() - metric_value) <= delta + metric_val_reached = sd["acc"].cpu().item() + diff = abs(metric_val_reached - metric_value) + logger.info( + "Goal metric value: " + str(metric_value) + ", metric value reached: " + str(metric_val_reached) + ",diff: " + str(diff) + ", delta: " + str(delta) + ) + assert diff <= delta @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), config_name="cifar10_resnet", version_base="1.2") From ea816dcf7ad75d0cc5bac211820aaeb489639442 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 26 Dec 2022 12:12:59 +0200 Subject: [PATCH 40/67] formatter --- tests/recipe_training_tests/recipe_sanity_test.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/recipe_training_tests/recipe_sanity_test.py b/tests/recipe_training_tests/recipe_sanity_test.py index 13f1e18be3..b32b6cbc53 100644 --- a/tests/recipe_training_tests/recipe_sanity_test.py +++ b/tests/recipe_training_tests/recipe_sanity_test.py @@ -7,20 +7,28 @@ from super_gradients import Trainer, init_trainer from super_gradients.common.environment.ddp_utils import multi_process_safe from super_gradients.common.abstractions.abstract_logger import get_logger +from super_gradients.training.utils.distributed_training_utils import wait_for_the_master, get_local_rank logger = get_logger(__name__) -@multi_process_safe def _assert_recipe_metric(experiment_name: str, metric_value: float, delta: float): ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, experiment_name) sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth")) metric_val_reached = sd["acc"].cpu().item() diff = abs(metric_val_reached - metric_value) + _print_test_result(delta, diff, metric_val_reached, metric_value) + if diff <= delta: + exit(0) + else: + exit(1) + + +@multi_process_safe +def _print_test_result(delta, diff, metric_val_reached, metric_value): logger.info( "Goal metric value: " + str(metric_value) + ", metric value reached: " + str(metric_val_reached) + ",diff: " + str(diff) + ", delta: " + str(delta) ) - assert diff <= delta @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), config_name="cifar10_resnet", version_base="1.2") @@ -29,7 +37,8 @@ def main(cfg: DictConfig) -> None: experiment_name = cfg["experiment_name"] delta = cfg["delta"] Trainer.train_from_config(cfg) - _assert_recipe_metric(experiment_name, goal_metric_val, delta) + with wait_for_the_master(get_local_rank()): + _assert_recipe_metric(experiment_name, goal_metric_val, delta) def run(): From 179d6fd3ced6925ea43352277362eae0f02d8a06 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 26 Dec 2022 14:44:03 +0200 Subject: [PATCH 41/67] teardown added to test + seg and det tests added --- .circleci/config.yml | 5 ++++- tests/recipe_training_tests/recipe_sanity_test.py | 9 +++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index cc8e832e27..7df18c39be 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,10 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet training_hyperparams.max_epochs=20 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 +goal_metric_val=0.813 +delta=0.02 + python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=coco2017_yolox architecture=yolox_n training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False training_hyperparams.loss=yolox_fast_loss num_gpus=4 +goal_metric_val=0.075 +delta=0.02 + python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cityscapes_regseg48 training_hyperparams.max_epochs=10 +goal_metric_val=0.263 +delta=0.025 + python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 +goal_metric_val=0.89 +delta=0.02 + - run: diff --git a/tests/recipe_training_tests/recipe_sanity_test.py b/tests/recipe_training_tests/recipe_sanity_test.py index b32b6cbc53..dcbc6315d7 100644 --- a/tests/recipe_training_tests/recipe_sanity_test.py +++ b/tests/recipe_training_tests/recipe_sanity_test.py @@ -1,3 +1,5 @@ +import shutil + from coverage.annotate import os from omegaconf import DictConfig import hydra @@ -18,6 +20,7 @@ def _assert_recipe_metric(experiment_name: str, metric_value: float, delta: floa metric_val_reached = sd["acc"].cpu().item() diff = abs(metric_val_reached - metric_value) _print_test_result(delta, diff, metric_val_reached, metric_value) + _tear_down(ckpt_dir) if diff <= delta: exit(0) else: @@ -31,6 +34,12 @@ def _print_test_result(delta, diff, metric_val_reached, metric_value): ) +@multi_process_safe +def _tear_down(ckpt_dir): + if os.path.exists(ckpt_dir): + shutil.rmtree(ckpt_dir) + + @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), config_name="cifar10_resnet", version_base="1.2") def main(cfg: DictConfig) -> None: goal_metric_val = cfg["goal_metric_val"] From d4f2da362fd65a54a8c54335e944d28015496241 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 26 Dec 2022 14:55:37 +0200 Subject: [PATCH 42/67] formatting --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 92f2852605..bae5c1beb0 100755 --- a/requirements.txt +++ b/requirements.txt @@ -32,3 +32,4 @@ wheel>=0.38.0 # not directly required, pinned by Snyk to avoid a vulnerability pygments>=2.7.4 stringcase>=1.2.0 +numpy<=1.23 From 6f1564f1041da81b72d53dc9ca3e6539d658653d Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 26 Dec 2022 15:07:04 +0200 Subject: [PATCH 43/67] large delta for det so it passes --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7df18c39be..ae194735b7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=coco2017_yolox architecture=yolox_n training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False training_hyperparams.loss=yolox_fast_loss num_gpus=4 +goal_metric_val=0.075 +delta=0.02 + python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=coco2017_yolox architecture=yolox_n training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False training_hyperparams.loss=yolox_fast_loss num_gpus=4 +goal_metric_val=0.075 +delta=0.2 python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cityscapes_regseg48 training_hyperparams.max_epochs=10 +goal_metric_val=0.263 +delta=0.025 python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 +goal_metric_val=0.89 +delta=0.02 From adf92458ba50a5dc5b6f30d0481dc8d78849faae Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 26 Dec 2022 15:27:36 +0200 Subject: [PATCH 44/67] larger shm 2nd try det --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ae194735b7..059ee3f51b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=coco2017_yolox architecture=yolox_n training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False training_hyperparams.loss=yolox_fast_loss num_gpus=4 +goal_metric_val=0.075 +delta=0.2 + python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=coco2017_yolox architecture=yolox_n training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False training_hyperparams.loss=yolox_fast_loss num_gpus=4 +goal_metric_val=0.075 +delta=0.3 python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cityscapes_regseg48 training_hyperparams.max_epochs=10 +goal_metric_val=0.263 +delta=0.025 python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 +goal_metric_val=0.89 +delta=0.02 From e656aa8de32587e9da5987bf59366370a97ba9b9 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 26 Dec 2022 16:00:34 +0200 Subject: [PATCH 45/67] 40g shm --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 059ee3f51b..97cd1171f5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=coco2017_yolox architecture=yolox_n training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False training_hyperparams.loss=yolox_fast_loss num_gpus=4 +goal_metric_val=0.075 +delta=0.3 + python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=coco2017_yolox architecture=yolox_n training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False training_hyperparams.loss=yolox_fast_loss num_gpus=4 +goal_metric_val=0.075 +delta=0.25 python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cityscapes_regseg48 training_hyperparams.max_epochs=10 +goal_metric_val=0.263 +delta=0.025 python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 +goal_metric_val=0.89 +delta=0.02 From 66d96670aeae143db9ccae7ff7f6c137da8a56e1 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 09:35:48 +0200 Subject: [PATCH 46/67] yolox goal map updated --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 97cd1171f5..cbbbb24da8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=coco2017_yolox architecture=yolox_n training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False training_hyperparams.loss=yolox_fast_loss num_gpus=4 +goal_metric_val=0.075 +delta=0.25 + python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=coco2017_yolox architecture=yolox_n training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False training_hyperparams.loss=yolox_fast_loss num_gpus=4 +goal_metric_val=0.044 +delta=0.01 python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cityscapes_regseg48 training_hyperparams.max_epochs=10 +goal_metric_val=0.263 +delta=0.025 python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 +goal_metric_val=0.89 +delta=0.02 From 86db15d724c8b9b82f923db13f2f9ef6cd93b726 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 11:22:25 +0200 Subject: [PATCH 47/67] coverage run added to config --- .circleci/config.yml | 5 +-- tests/deci_core_recipe_test_suite_runner.py | 23 ++++++++++++ .../recipe_sanity_test.py | 3 +- .../shortened_recipes_accuracy_test.py | 37 +++++++++++++++++++ 4 files changed, 64 insertions(+), 4 deletions(-) create mode 100644 tests/deci_core_recipe_test_suite_runner.py create mode 100644 tests/recipe_training_tests/shortened_recipes_accuracy_test.py diff --git a/.circleci/config.yml b/.circleci/config.yml index cbbbb24da8..61585e0ff5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,9 +203,8 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=coco2017_yolox architecture=yolox_n training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False training_hyperparams.loss=yolox_fast_loss num_gpus=4 +goal_metric_val=0.044 +delta=0.01 - python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cityscapes_regseg48 training_hyperparams.max_epochs=10 +goal_metric_val=0.263 +delta=0.025 - python3.8 tests/recipe_training_tests/recipe_sanity_test.py --config-name=cifar10_resnet training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 +goal_metric_val=0.89 +delta=0.02 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test + coverage run --source=super_gradients -m unittest tests/deci_core_unit_test_suite_runner.py diff --git a/tests/deci_core_recipe_test_suite_runner.py b/tests/deci_core_recipe_test_suite_runner.py new file mode 100644 index 0000000000..5d682b4625 --- /dev/null +++ b/tests/deci_core_recipe_test_suite_runner.py @@ -0,0 +1,23 @@ +import sys +import unittest + +from tests.recipe_training_tests.shortened_recipes_accuracy_test import ShortenedRecipesAccuracyTests + + +class CoreUnitTestSuiteRunner: + def __init__(self): + self.test_loader = unittest.TestLoader() + self.recipe_tests_suite = unittest.TestSuite() + self._add_modules_to_unit_tests_suite() + self.test_runner = unittest.TextTestRunner(verbosity=3, stream=sys.stdout) + + def _add_modules_to_unit_tests_suite(self): + """ + _add_modules_to_unit_tests_suite - Adds unit tests to the Unit Tests Test Suite + :return: + """ + self.recipe_tests_suite.addTest(self.test_loader.loadTestsFromModule(ShortenedRecipesAccuracyTests)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/recipe_training_tests/recipe_sanity_test.py b/tests/recipe_training_tests/recipe_sanity_test.py index dcbc6315d7..6d48739d2c 100644 --- a/tests/recipe_training_tests/recipe_sanity_test.py +++ b/tests/recipe_training_tests/recipe_sanity_test.py @@ -37,7 +37,7 @@ def _print_test_result(delta, diff, metric_val_reached, metric_value): @multi_process_safe def _tear_down(ckpt_dir): if os.path.exists(ckpt_dir): - shutil.rmtree(ckpt_dir) + shutil.rmtree(ckpt_dir, ignore_errors=True) @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), config_name="cifar10_resnet", version_base="1.2") @@ -46,6 +46,7 @@ def main(cfg: DictConfig) -> None: experiment_name = cfg["experiment_name"] delta = cfg["delta"] Trainer.train_from_config(cfg) + logger.info("Local rank:" + str(get_local_rank())) with wait_for_the_master(get_local_rank()): _assert_recipe_metric(experiment_name, goal_metric_val, delta) diff --git a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py new file mode 100644 index 0000000000..d86f02d776 --- /dev/null +++ b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py @@ -0,0 +1,37 @@ +import unittest +import shutil + +from coverage.annotate import os +from super_gradients.common.environment import environment_config +import torch + + +class ShortenedRecipesAccuracyTests(unittest.TestCase): + @classmethod + def setUp(cls): + cls.experiment_names = ["cifar10_resnet_accuracy_test", "coco2017_yolox_n_accuracy_test", "cityscapes_regseg48_accuracy_test"] + + cls.goal_metrics = {"cifar10_resnet_accuracy_test": 0.89, "coco2017_yolox_n_accuracy_test": 0.044, "cityscapes_regseg48_accuracy_test": 0.263} + + def test_shortened_cifar10_resnet_accuracy(self): + self.assertTrue(self._reached_goal_metric(experiment_name="cifar10_resnet_accuracy_test", metric_value=0.89, delta=0.02)) + + @classmethod + def _reached_goal_metric(cls, experiment_name: str, metric_value: float, delta: float): + ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, experiment_name) + sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth")) + metric_val_reached = sd["acc"].cpu().item() + diff = abs(metric_val_reached - metric_value) + return diff <= delta + + @classmethod + def tearDownClass(cls) -> None: + # ERASE ALL THE FOLDERS THAT WERE CREATED DURING THIS TEST + for folder in cls.experiment_names: + ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, folder) + if os.path.isdir(ckpt_dir): + shutil.rmtree(ckpt_dir) + + +if __name__ == "__main__": + unittest.main() From 9d886c4781e334d453d081f98c805a9f349dfbe2 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 11:23:11 +0200 Subject: [PATCH 48/67] typo in config --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 61585e0ff5..98fab423f2 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -204,7 +204,7 @@ jobs: python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test - coverage run --source=super_gradients -m unittest tests/deci_core_unit_test_suite_runner.py + coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py From 3784966fbe7163b03a2a802f9fc867e8559f1d5f Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 11:34:31 +0200 Subject: [PATCH 49/67] max epochs fix --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 98fab423f2..21cade43d5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py From f5fe2410ce485be49a28ecca349786dc997bf693 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 12:04:48 +0200 Subject: [PATCH 50/67] max epochs fix2 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 21cade43d5..f0be90f893 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py From e624d330c5a202b6597dfda332c26e7864d47531 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 12:18:04 +0200 Subject: [PATCH 51/67] old test removed --- .../recipe_sanity_test.py | 60 ------------------- 1 file changed, 60 deletions(-) delete mode 100644 tests/recipe_training_tests/recipe_sanity_test.py diff --git a/tests/recipe_training_tests/recipe_sanity_test.py b/tests/recipe_training_tests/recipe_sanity_test.py deleted file mode 100644 index 6d48739d2c..0000000000 --- a/tests/recipe_training_tests/recipe_sanity_test.py +++ /dev/null @@ -1,60 +0,0 @@ -import shutil - -from coverage.annotate import os -from omegaconf import DictConfig -import hydra -import pkg_resources -from super_gradients.common.environment import environment_config -import torch -from super_gradients import Trainer, init_trainer -from super_gradients.common.environment.ddp_utils import multi_process_safe -from super_gradients.common.abstractions.abstract_logger import get_logger -from super_gradients.training.utils.distributed_training_utils import wait_for_the_master, get_local_rank - -logger = get_logger(__name__) - - -def _assert_recipe_metric(experiment_name: str, metric_value: float, delta: float): - ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, experiment_name) - sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth")) - metric_val_reached = sd["acc"].cpu().item() - diff = abs(metric_val_reached - metric_value) - _print_test_result(delta, diff, metric_val_reached, metric_value) - _tear_down(ckpt_dir) - if diff <= delta: - exit(0) - else: - exit(1) - - -@multi_process_safe -def _print_test_result(delta, diff, metric_val_reached, metric_value): - logger.info( - "Goal metric value: " + str(metric_value) + ", metric value reached: " + str(metric_val_reached) + ",diff: " + str(diff) + ", delta: " + str(delta) - ) - - -@multi_process_safe -def _tear_down(ckpt_dir): - if os.path.exists(ckpt_dir): - shutil.rmtree(ckpt_dir, ignore_errors=True) - - -@hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), config_name="cifar10_resnet", version_base="1.2") -def main(cfg: DictConfig) -> None: - goal_metric_val = cfg["goal_metric_val"] - experiment_name = cfg["experiment_name"] - delta = cfg["delta"] - Trainer.train_from_config(cfg) - logger.info("Local rank:" + str(get_local_rank())) - with wait_for_the_master(get_local_rank()): - _assert_recipe_metric(experiment_name, goal_metric_val, delta) - - -def run(): - init_trainer() - main() - - -if __name__ == "__main__": - run() From c49b7b8b66b3e382556f576f0f0ca3a63074a548 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 13:22:07 +0200 Subject: [PATCH 52/67] format --- tests/recipe_training_tests/shortened_recipes_accuracy_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py index d86f02d776..9436efe55f 100644 --- a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py +++ b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py @@ -22,6 +22,9 @@ def _reached_goal_metric(cls, experiment_name: str, metric_value: float, delta: sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth")) metric_val_reached = sd["acc"].cpu().item() diff = abs(metric_val_reached - metric_value) + print( + "Goal metric value: " + str(metric_value) + ", metric value reached: " + str(metric_val_reached) + ",diff: " + str(diff) + ", delta: " + str(delta) + ) return diff <= delta @classmethod From 09ae870175334f12bf3b32486cc78d7ba39d9ac5 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 13:34:49 +0200 Subject: [PATCH 53/67] exit 0 added to train from recipe --- .../examples/train_from_recipe_example/train_from_recipe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py index b4c1a5098f..40ede17f28 100644 --- a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py +++ b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py @@ -20,6 +20,7 @@ def main(cfg: DictConfig) -> None: def run(): init_trainer() main() + exit(0) if __name__ == "__main__": From 53dd3fb8d217e21a932669bb337cde18ff960ca0 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 13:46:48 +0200 Subject: [PATCH 54/67] exit code moved to hydra main --- .../examples/train_from_recipe_example/train_from_recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py index 40ede17f28..0328f5f08e 100644 --- a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py +++ b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py @@ -15,12 +15,12 @@ @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), version_base="1.2") def main(cfg: DictConfig) -> None: Trainer.train_from_config(cfg) + exit(0) def run(): init_trainer() main() - exit(0) if __name__ == "__main__": From 531fd3c01092358a1d91ca46efae799fd5fbfafc Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 14:24:11 +0200 Subject: [PATCH 55/67] exit 0 addded --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f0be90f893..15f0d178f5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 || exit 0 coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py From 7c4323b5a7f8d6a100952513d27be756d987dc7d Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 14:31:03 +0200 Subject: [PATCH 56/67] exit code for ddp --- .circleci/config.yml | 2 +- .../training/utils/distributed_training_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 15f0d178f5..f0be90f893 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 || exit 0 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py diff --git a/src/super_gradients/training/utils/distributed_training_utils.py b/src/super_gradients/training/utils/distributed_training_utils.py index c637ea135f..49a826389c 100755 --- a/src/super_gradients/training/utils/distributed_training_utils.py +++ b/src/super_gradients/training/utils/distributed_training_utils.py @@ -242,7 +242,7 @@ def restart_script_with_ddp(num_gpus: int = None): elastic_launch(config=config, entrypoint=sys.executable)(*sys.argv, *EXTRA_ARGS) # The code below should actually never be reached as the process will be in a loop inside elastic_launch until any subprocess crashes. - sys.exit("Main process finished") + sys.exit(0) def get_gpu_mem_utilization(): From 054410a0c9df634d145cab313ae13fabb2ab89b9 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 14:54:18 +0200 Subject: [PATCH 57/67] cifar num epochs fix 100 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f0be90f893..21cade43d5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py From 603d8c0bf91276a13d41d14ec4a419801f7fe432 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 15:48:13 +0200 Subject: [PATCH 58/67] added determinism for train from recipe and commands for yolox and regseg --- .circleci/config.yml | 2 ++ .../train_from_recipe_example/train_from_recipe.py | 5 ++++- .../shortened_recipes_accuracy_test.py | 8 +++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 21cade43d5..054ec249ef 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -204,6 +204,8 @@ jobs: python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox experiment_name=coco2017_yolox_n_accuracy_test training_hyperparams.loss=yolox_fast_loss training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=cityscapes_regseg48_accuracy_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py diff --git a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py index 0328f5f08e..e00e4e62d8 100644 --- a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py +++ b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py @@ -10,12 +10,15 @@ import pkg_resources from super_gradients import Trainer, init_trainer +import torch + +torch.use_deterministic_algorithms() +torch.backends.cudnn.benchmark = False @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), version_base="1.2") def main(cfg: DictConfig) -> None: Trainer.train_from_config(cfg) - exit(0) def run(): diff --git a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py index 9436efe55f..cdc026d8c9 100644 --- a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py +++ b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py @@ -14,7 +14,13 @@ def setUp(cls): cls.goal_metrics = {"cifar10_resnet_accuracy_test": 0.89, "coco2017_yolox_n_accuracy_test": 0.044, "cityscapes_regseg48_accuracy_test": 0.263} def test_shortened_cifar10_resnet_accuracy(self): - self.assertTrue(self._reached_goal_metric(experiment_name="cifar10_resnet_accuracy_test", metric_value=0.89, delta=0.02)) + self.assertTrue(self._reached_goal_metric(experiment_name="cifar10_resnet_accuracy_test", metric_value=0.9167, delta=0.02)) + + def test_shortened_coco2017_yolox_n_map(self): + self.assertTrue(self._reached_goal_metric(experiment_name="coco2017_yolox_n_accuracy_test", metric_value=0.044, delta=0.01)) + + def test_shortened_cityscapes_regseg48_iou(self): + self.assertTrue(self._reached_goal_metric(experiment_name="cityscapes_regseg48_accuracy_test", metric_value=0.263, delta=0.05)) @classmethod def _reached_goal_metric(cls, experiment_name: str, metric_value: float, delta: float): From 5f7a3f507d97fc5052c33437e93f9a5043cab167 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 15:52:58 +0200 Subject: [PATCH 59/67] torch deterministic mode fix --- .../examples/train_from_recipe_example/train_from_recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py index e00e4e62d8..90baacd544 100644 --- a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py +++ b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py @@ -12,7 +12,7 @@ from super_gradients import Trainer, init_trainer import torch -torch.use_deterministic_algorithms() +torch.use_deterministic_algorithms(True) torch.backends.cudnn.benchmark = False From af6a7e18ac8663d553d30de723216e76013d64a5 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 16:04:07 +0200 Subject: [PATCH 60/67] env var for reproducibality --- .../examples/train_from_recipe_example/train_from_recipe.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py index 90baacd544..4618972fbb 100644 --- a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py +++ b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py @@ -11,10 +11,14 @@ from super_gradients import Trainer, init_trainer import torch +import os torch.use_deterministic_algorithms(True) torch.backends.cudnn.benchmark = False +# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility +os.environ["CUBLAS_WORKSPACE_CONFIG"] = "4096:8" + @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), version_base="1.2") def main(cfg: DictConfig) -> None: From f1e2ebed11f0b73e883d9ebc931184b7febac910 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 16:11:02 +0200 Subject: [PATCH 61/67] 2nd option for env var --- .../examples/train_from_recipe_example/train_from_recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py index 4618972fbb..3da3f515e2 100644 --- a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py +++ b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py @@ -17,7 +17,7 @@ torch.backends.cudnn.benchmark = False # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility -os.environ["CUBLAS_WORKSPACE_CONFIG"] = "4096:8" +os.environ["CUBLAS_WORKSPACE_CONFIG"] = "16:8" @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), version_base="1.2") From 7314b6ef9485e9371092eb768272afa2d6d4471c Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 16:16:48 +0200 Subject: [PATCH 62/67] cublas envvar --- .circleci/config.yml | 1 + .../examples/train_from_recipe_example/train_from_recipe.py | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 054ec249ef..002988f2a6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,6 +203,7 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 + export CUBLAS_WORKSPACE_CONFIG=:4096:8 python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox experiment_name=coco2017_yolox_n_accuracy_test training_hyperparams.loss=yolox_fast_loss training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=cityscapes_regseg48_accuracy_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 diff --git a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py index 3da3f515e2..90baacd544 100644 --- a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py +++ b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py @@ -11,14 +11,10 @@ from super_gradients import Trainer, init_trainer import torch -import os torch.use_deterministic_algorithms(True) torch.backends.cudnn.benchmark = False -# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility -os.environ["CUBLAS_WORKSPACE_CONFIG"] = "16:8" - @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), version_base="1.2") def main(cfg: DictConfig) -> None: From 76d2d7f5109580f752e4b4c9ca2d56908a03cfe5 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 27 Dec 2022 17:13:39 +0200 Subject: [PATCH 63/67] remove determins flags --- .circleci/config.yml | 1 - .../examples/train_from_recipe_example/train_from_recipe.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 002988f2a6..054ec249ef 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,7 +203,6 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - export CUBLAS_WORKSPACE_CONFIG=:4096:8 python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox experiment_name=coco2017_yolox_n_accuracy_test training_hyperparams.loss=yolox_fast_loss training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=cityscapes_regseg48_accuracy_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 diff --git a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py index 90baacd544..b4c1a5098f 100644 --- a/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py +++ b/src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py @@ -10,10 +10,6 @@ import pkg_resources from super_gradients import Trainer, init_trainer -import torch - -torch.use_deterministic_algorithms(True) -torch.backends.cudnn.benchmark = False @hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""), version_base="1.2") From 75bd128b61e8751014331caab97fa78f5826e83f Mon Sep 17 00:00:00 2001 From: shayaharon Date: Wed, 28 Dec 2022 08:53:52 +0200 Subject: [PATCH 64/67] yolox test arch set to n --- .circleci/config.yml | 8 +++----- .../shortened_recipes_accuracy_test.py | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 054ec249ef..4d5e62a1a7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -203,13 +203,11 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH} python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=cifar10_resnet_accuracy_test training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox experiment_name=coco2017_yolox_n_accuracy_test training_hyperparams.loss=yolox_fast_loss training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=cityscapes_regseg48_accuracy_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=shortened_cifar10_resnet_accuracy_test training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test architecture=yolox_n training_hyperparams.loss=yolox_fast_loss training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=shortened_cityscapes_regseg48_iou_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py - - - run: name: Remove new environment when failed command: "rm -r << parameters.sg_new_env_name >>" diff --git a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py index cdc026d8c9..afa0c85b99 100644 --- a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py +++ b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py @@ -9,15 +9,13 @@ class ShortenedRecipesAccuracyTests(unittest.TestCase): @classmethod def setUp(cls): - cls.experiment_names = ["cifar10_resnet_accuracy_test", "coco2017_yolox_n_accuracy_test", "cityscapes_regseg48_accuracy_test"] - - cls.goal_metrics = {"cifar10_resnet_accuracy_test": 0.89, "coco2017_yolox_n_accuracy_test": 0.044, "cityscapes_regseg48_accuracy_test": 0.263} + cls.experiment_names = ["shortened_cifar10_resnet_accuracy_test", "shortened_coco2017_yolox_n_map_test", "shortened_cityscapes_regseg48_iou_test"] def test_shortened_cifar10_resnet_accuracy(self): - self.assertTrue(self._reached_goal_metric(experiment_name="cifar10_resnet_accuracy_test", metric_value=0.9167, delta=0.02)) + self.assertTrue(self._reached_goal_metric(experiment_name="shortened_cifar10_resnet_accuracy_test", metric_value=0.9167, delta=0.02)) def test_shortened_coco2017_yolox_n_map(self): - self.assertTrue(self._reached_goal_metric(experiment_name="coco2017_yolox_n_accuracy_test", metric_value=0.044, delta=0.01)) + self.assertTrue(self._reached_goal_metric(experiment_name="coco2017_yolox_n_accuracy_test", metric_value=0.044, delta=0.02)) def test_shortened_cityscapes_regseg48_iou(self): self.assertTrue(self._reached_goal_metric(experiment_name="cityscapes_regseg48_accuracy_test", metric_value=0.263, delta=0.05)) From 2a127ec1174c0760a3fc2e1bacbb10fdb3f72c36 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Wed, 28 Dec 2022 08:56:50 +0200 Subject: [PATCH 65/67] exp name fixes --- .../recipe_training_tests/shortened_recipes_accuracy_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py index afa0c85b99..93fc833b58 100644 --- a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py +++ b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py @@ -15,10 +15,10 @@ def test_shortened_cifar10_resnet_accuracy(self): self.assertTrue(self._reached_goal_metric(experiment_name="shortened_cifar10_resnet_accuracy_test", metric_value=0.9167, delta=0.02)) def test_shortened_coco2017_yolox_n_map(self): - self.assertTrue(self._reached_goal_metric(experiment_name="coco2017_yolox_n_accuracy_test", metric_value=0.044, delta=0.02)) + self.assertTrue(self._reached_goal_metric(experiment_name="shortened_coco2017_yolox_n_map_test", metric_value=0.044, delta=0.02)) def test_shortened_cityscapes_regseg48_iou(self): - self.assertTrue(self._reached_goal_metric(experiment_name="cityscapes_regseg48_accuracy_test", metric_value=0.263, delta=0.05)) + self.assertTrue(self._reached_goal_metric(experiment_name="shortened_cityscapes_regseg48_iou_test", metric_value=0.263, delta=0.05)) @classmethod def _reached_goal_metric(cls, experiment_name: str, metric_value: float, delta: float): From 4055cfabd20d662413a1fd4413fd1fd994929745 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Wed, 28 Dec 2022 09:25:09 +0200 Subject: [PATCH 66/67] recipe tests added to release workflow --- .circleci/config.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4d5e62a1a7..99ee420229 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -232,10 +232,13 @@ workflows: - deci-common/persist_version_info - login_to_codeartifact_release <<: *release_tag_filter + - recipe_tests: + <<: *release_tag_filter - release_version: py_version: "3.7" requires: - "build3.7" + - recipe_tests <<: *release_tag_filter - deci-common/pip_upload_package_from_codeartifact_to_global_pypi: package_name: "super-gradients" @@ -254,7 +257,6 @@ workflows: - deci-common/codeartifact_login: repo_name: "deci-packages" - - recipe_tests - build: name: "build3.7" py_version: "3.7" From f84daea9b84a8b4f8e10d59d31bd86954dce189a Mon Sep 17 00:00:00 2001 From: shayaharon Date: Wed, 28 Dec 2022 11:42:01 +0200 Subject: [PATCH 67/67] updated delta for cifar --- tests/recipe_training_tests/shortened_recipes_accuracy_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py index 93fc833b58..5d9ea4aee2 100644 --- a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py +++ b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py @@ -12,7 +12,7 @@ def setUp(cls): cls.experiment_names = ["shortened_cifar10_resnet_accuracy_test", "shortened_coco2017_yolox_n_map_test", "shortened_cityscapes_regseg48_iou_test"] def test_shortened_cifar10_resnet_accuracy(self): - self.assertTrue(self._reached_goal_metric(experiment_name="shortened_cifar10_resnet_accuracy_test", metric_value=0.9167, delta=0.02)) + self.assertTrue(self._reached_goal_metric(experiment_name="shortened_cifar10_resnet_accuracy_test", metric_value=0.9167, delta=0.05)) def test_shortened_coco2017_yolox_n_map(self): self.assertTrue(self._reached_goal_metric(experiment_name="shortened_coco2017_yolox_n_map_test", metric_value=0.044, delta=0.02))