Feature/sg 521 gpu tests (#587)

shaydeci · ofrimasad · web-flow · commit 9e24a33f76da · 2022-12-29T09:17:50.000Z
* workflow added

* first tests added

* sanity tests moved

* -m removed

* env var added

* installation from branch added

* more changes

* command fix

* formatt

* remove env adde to recipe+tests

* command fix in config

* torchrun instead of python

* command update

* hydra full error env var

* train from recipe cmd

* torch installation fix

* protobuf version try

* lets get this running

* lets get this working

* let make this work2

* lets make this work 3.0

* let make this work 4.0

* lets make this work 5.0

* coco try

* coco try yolox

* coco try yolox fix num gpus

* reordr installs

* order installs + python3.8 removed

* order installs + python3.8

* torch 1.12

* linter

* cleanup and 11.6

* 11.6 with 2 epochs

* dist launch used

* dataset params lines removed

* nccl debug

* assert with abs, cifar rolled back

* cifar recipe fix

* formatting

* formatter

* teardown added to test + seg and det tests added

* formatting

* large delta for det so it passes

* larger shm 2nd try det

* 40g shm

* yolox goal map updated

* coverage run added to config

* typo in config

* max epochs fix

* max epochs fix2

* old test removed

* format

* exit 0 added to train from recipe

* exit code moved to hydra main

* exit 0 addded

* exit code for ddp

* cifar num epochs fix 100

* added determinism for train from recipe and commands for yolox and regseg

* torch deterministic mode fix

* env var for reproducibality

* 2nd option for env var

* cublas envvar

* remove determins flags

* yolox test arch set to n

* exp name fixes

* recipe tests added to release workflow

* updated delta for cifar

Co-authored-by: Ofri Masad &lt;ofrimasad@users.noreply.github.com&gt;
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -104,7 +104,6 @@ jobs:
       - store_artifacts:
           path: ~/sg_logs
 
-
   release_candidate:
     parameters:
       py_version:
@@ -180,6 +179,40 @@ jobs:
           tag: $CIRCLE_TAG
           notes: "This GitHub Release was done automatically by CircleCI"
 
+  recipe_tests:
+    machine: true
+    resource_class: deci-ai/sg-gpu-on-premise
+    parameters:
+      sg_existing_env_path:
+        type: string
+        default: "/env/persistent_env"
+      sg_new_env_name:
+        type: string
+        default: "${CIRCLE_BUILD_NUM}"
+      sg_new_env_python_version:
+        type: string
+        default: "python3.8"
+    steps:
+      - checkout
+      - run:
+          name: install requirements and run recipe tests
+          command: |
+            << parameters.sg_new_env_python_version >> -m venv << parameters.sg_new_env_name >>
+            source << parameters.sg_new_env_name >>/bin/activate
+            python3.8 -m pip install --upgrade setuptools pip wheel
+            python3.8 -m pip install -r requirements.txt
+            python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH}
+            python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116
+            python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=shortened_cifar10_resnet_accuracy_test training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4
+            python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test architecture=yolox_n training_hyperparams.loss=yolox_fast_loss training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
+            python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=shortened_cityscapes_regseg48_iou_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
+            coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py
+
+      - run:
+          name: Remove new environment when failed
+          command: "rm -r << parameters.sg_new_env_name >>"
+          when: on_fail
+
 
 
 workflows:
@@ -199,10 +232,13 @@ workflows:
             - deci-common/persist_version_info
             - login_to_codeartifact_release
           <<: *release_tag_filter
+      - recipe_tests:
+          <<: *release_tag_filter
       - release_version:
           py_version: "3.7"
           requires:
             - "build3.7"
+            - recipe_tests
           <<: *release_tag_filter
       - deci-common/pip_upload_package_from_codeartifact_to_global_pypi:
           package_name: "super-gradients"
@@ -219,13 +255,15 @@ workflows:
       - deci-common/persist_version_info
       - deci-common/codeartifact_login:
           repo_name: "deci-packages"
+
       - build:
           name: "build3.7"
           py_version: "3.7"
           package_name: "super-gradients"
           requires:
             - deci-common/persist_version_info
             - deci-common/codeartifact_login
+
       - release_candidate: # happens on merge
           py_version: "3.7"
           requires:
diff --git a/requirements.txt b/requirements.txt
@@ -32,3 +32,4 @@ wheel>=0.38.0
 # not directly required, pinned by Snyk to avoid a vulnerability
 pygments>=2.7.4
 stringcase>=1.2.0
+numpy<=1.23
diff --git a/src/super_gradients/recipes/cifar10_resnet.yaml b/src/super_gradients/recipes/cifar10_resnet.yaml
@@ -24,7 +24,6 @@ resume: False
 training_hyperparams:
   resume: ${resume}
 
-
 ckpt_root_dir:
 
 architecture: resnet18_cifar
diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py
@@ -954,7 +954,13 @@ def forward(self, inputs, targets):
             training_params = dict()
         self.train_loader = train_loader or self.train_loader
         self.valid_loader = valid_loader or self.valid_loader
-        if len(self.train_loader.dataset) % self.train_loader.batch_size != 0 and not self.train_loader.drop_last:
+
+        if hasattr(self.train_loader, "batch_sampler") and self.train_loader.batch_sampler is not None:
+            batch_size = self.train_loader.batch_sampler.batch_size
+        else:
+            batch_size = self.train_loader.batch_size
+
+        if len(self.train_loader.dataset) % batch_size != 0 and not self.train_loader.drop_last:
             logger.warning("Train dataset size % batch_size != 0 and drop_last=False, this might result in smaller " "last batch.")
         self._set_dataset_params()
 
diff --git a/src/super_gradients/training/utils/distributed_training_utils.py b/src/super_gradients/training/utils/distributed_training_utils.py
@@ -242,7 +242,7 @@ def restart_script_with_ddp(num_gpus: int = None):
     elastic_launch(config=config, entrypoint=sys.executable)(*sys.argv, *EXTRA_ARGS)
 
     # The code below should actually never be reached as the process will be in a loop inside elastic_launch until any subprocess crashes.
-    sys.exit("Main process finished")
+    sys.exit(0)
 
 
 def get_gpu_mem_utilization():
diff --git a/tests/deci_core_recipe_test_suite_runner.py b/tests/deci_core_recipe_test_suite_runner.py
@@ -0,0 +1,23 @@
+import sys
+import unittest
+
+from tests.recipe_training_tests.shortened_recipes_accuracy_test import ShortenedRecipesAccuracyTests
+
+
+class CoreUnitTestSuiteRunner:
+    def __init__(self):
+        self.test_loader = unittest.TestLoader()
+        self.recipe_tests_suite = unittest.TestSuite()
+        self._add_modules_to_unit_tests_suite()
+        self.test_runner = unittest.TextTestRunner(verbosity=3, stream=sys.stdout)
+
+    def _add_modules_to_unit_tests_suite(self):
+        """
+        _add_modules_to_unit_tests_suite - Adds unit tests to the Unit Tests Test Suite
+            :return:
+        """
+        self.recipe_tests_suite.addTest(self.test_loader.loadTestsFromModule(ShortenedRecipesAccuracyTests))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/recipe_training_tests/__init__.py b/tests/recipe_training_tests/__init__.py
diff --git a/tests/recipe_training_tests/shortened_recipes_accuracy_test.py b/tests/recipe_training_tests/shortened_recipes_accuracy_test.py
@@ -0,0 +1,44 @@
+import unittest
+import shutil
+
+from coverage.annotate import os
+from super_gradients.common.environment import environment_config
+import torch
+
+
+class ShortenedRecipesAccuracyTests(unittest.TestCase):
+    @classmethod
+    def setUp(cls):
+        cls.experiment_names = ["shortened_cifar10_resnet_accuracy_test", "shortened_coco2017_yolox_n_map_test", "shortened_cityscapes_regseg48_iou_test"]
+
+    def test_shortened_cifar10_resnet_accuracy(self):
+        self.assertTrue(self._reached_goal_metric(experiment_name="shortened_cifar10_resnet_accuracy_test", metric_value=0.9167, delta=0.05))
+
+    def test_shortened_coco2017_yolox_n_map(self):
+        self.assertTrue(self._reached_goal_metric(experiment_name="shortened_coco2017_yolox_n_map_test", metric_value=0.044, delta=0.02))
+
+    def test_shortened_cityscapes_regseg48_iou(self):
+        self.assertTrue(self._reached_goal_metric(experiment_name="shortened_cityscapes_regseg48_iou_test", metric_value=0.263, delta=0.05))
+
+    @classmethod
+    def _reached_goal_metric(cls, experiment_name: str, metric_value: float, delta: float):
+        ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, experiment_name)
+        sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth"))
+        metric_val_reached = sd["acc"].cpu().item()
+        diff = abs(metric_val_reached - metric_value)
+        print(
+            "Goal metric value: " + str(metric_value) + ", metric value reached: " + str(metric_val_reached) + ",diff: " + str(diff) + ", delta: " + str(delta)
+        )
+        return diff <= delta
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        # ERASE ALL THE FOLDERS THAT WERE CREATED DURING THIS TEST
+        for folder in cls.experiment_names:
+            ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, folder)
+            if os.path.isdir(ckpt_dir):
+                shutil.rmtree(ckpt_dir)
+
+
+if __name__ == "__main__":
+    unittest.main()