Skip to content

Commit 9e24a33

Browse files
shaydeciofrimasad
andauthored
Feature/sg 521 gpu tests (#587)
* workflow added * first tests added * sanity tests moved * -m removed * env var added * installation from branch added * more changes * command fix * formatt * remove env adde to recipe+tests * command fix in config * torchrun instead of python * command update * hydra full error env var * train from recipe cmd * torch installation fix * protobuf version try * lets get this running * lets get this working * let make this work2 * lets make this work 3.0 * let make this work 4.0 * lets make this work 5.0 * coco try * coco try yolox * coco try yolox fix num gpus * reordr installs * order installs + python3.8 removed * order installs + python3.8 * torch 1.12 * linter * cleanup and 11.6 * 11.6 with 2 epochs * dist launch used * dataset params lines removed * nccl debug * assert with abs, cifar rolled back * cifar recipe fix * formatting * formatter * teardown added to test + seg and det tests added * formatting * large delta for det so it passes * larger shm 2nd try det * 40g shm * yolox goal map updated * coverage run added to config * typo in config * max epochs fix * max epochs fix2 * old test removed * format * exit 0 added to train from recipe * exit code moved to hydra main * exit 0 addded * exit code for ddp * cifar num epochs fix 100 * added determinism for train from recipe and commands for yolox and regseg * torch deterministic mode fix * env var for reproducibality * 2nd option for env var * cublas envvar * remove determins flags * yolox test arch set to n * exp name fixes * recipe tests added to release workflow * updated delta for cifar Co-authored-by: Ofri Masad <[email protected]>
1 parent c3751fd commit 9e24a33

File tree

8 files changed

+115
-4
lines changed

8 files changed

+115
-4
lines changed

.circleci/config.yml

+39-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,6 @@ jobs:
104104
- store_artifacts:
105105
path: ~/sg_logs
106106

107-
108107
release_candidate:
109108
parameters:
110109
py_version:
@@ -180,6 +179,40 @@ jobs:
180179
tag: $CIRCLE_TAG
181180
notes: "This GitHub Release was done automatically by CircleCI"
182181

182+
recipe_tests:
183+
machine: true
184+
resource_class: deci-ai/sg-gpu-on-premise
185+
parameters:
186+
sg_existing_env_path:
187+
type: string
188+
default: "/env/persistent_env"
189+
sg_new_env_name:
190+
type: string
191+
default: "${CIRCLE_BUILD_NUM}"
192+
sg_new_env_python_version:
193+
type: string
194+
default: "python3.8"
195+
steps:
196+
- checkout
197+
- run:
198+
name: install requirements and run recipe tests
199+
command: |
200+
<< parameters.sg_new_env_python_version >> -m venv << parameters.sg_new_env_name >>
201+
source << parameters.sg_new_env_name >>/bin/activate
202+
python3.8 -m pip install --upgrade setuptools pip wheel
203+
python3.8 -m pip install -r requirements.txt
204+
python3.8 -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH}
205+
python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116
206+
python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=shortened_cifar10_resnet_accuracy_test training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4
207+
python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test architecture=yolox_n training_hyperparams.loss=yolox_fast_loss training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
208+
python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=shortened_cityscapes_regseg48_iou_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
209+
coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py
210+
211+
- run:
212+
name: Remove new environment when failed
213+
command: "rm -r << parameters.sg_new_env_name >>"
214+
when: on_fail
215+
183216

184217

185218
workflows:
@@ -199,10 +232,13 @@ workflows:
199232
- deci-common/persist_version_info
200233
- login_to_codeartifact_release
201234
<<: *release_tag_filter
235+
- recipe_tests:
236+
<<: *release_tag_filter
202237
- release_version:
203238
py_version: "3.7"
204239
requires:
205240
- "build3.7"
241+
- recipe_tests
206242
<<: *release_tag_filter
207243
- deci-common/pip_upload_package_from_codeartifact_to_global_pypi:
208244
package_name: "super-gradients"
@@ -219,13 +255,15 @@ workflows:
219255
- deci-common/persist_version_info
220256
- deci-common/codeartifact_login:
221257
repo_name: "deci-packages"
258+
222259
- build:
223260
name: "build3.7"
224261
py_version: "3.7"
225262
package_name: "super-gradients"
226263
requires:
227264
- deci-common/persist_version_info
228265
- deci-common/codeartifact_login
266+
229267
- release_candidate: # happens on merge
230268
py_version: "3.7"
231269
requires:

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,4 @@ wheel>=0.38.0
3232
# not directly required, pinned by Snyk to avoid a vulnerability
3333
pygments>=2.7.4
3434
stringcase>=1.2.0
35+
numpy<=1.23

src/super_gradients/recipes/cifar10_resnet.yaml

-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ resume: False
2424
training_hyperparams:
2525
resume: ${resume}
2626

27-
2827
ckpt_root_dir:
2928

3029
architecture: resnet18_cifar

src/super_gradients/training/sg_trainer/sg_trainer.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -954,7 +954,13 @@ def forward(self, inputs, targets):
954954
training_params = dict()
955955
self.train_loader = train_loader or self.train_loader
956956
self.valid_loader = valid_loader or self.valid_loader
957-
if len(self.train_loader.dataset) % self.train_loader.batch_size != 0 and not self.train_loader.drop_last:
957+
958+
if hasattr(self.train_loader, "batch_sampler") and self.train_loader.batch_sampler is not None:
959+
batch_size = self.train_loader.batch_sampler.batch_size
960+
else:
961+
batch_size = self.train_loader.batch_size
962+
963+
if len(self.train_loader.dataset) % batch_size != 0 and not self.train_loader.drop_last:
958964
logger.warning("Train dataset size % batch_size != 0 and drop_last=False, this might result in smaller " "last batch.")
959965
self._set_dataset_params()
960966

src/super_gradients/training/utils/distributed_training_utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def restart_script_with_ddp(num_gpus: int = None):
242242
elastic_launch(config=config, entrypoint=sys.executable)(*sys.argv, *EXTRA_ARGS)
243243

244244
# The code below should actually never be reached as the process will be in a loop inside elastic_launch until any subprocess crashes.
245-
sys.exit("Main process finished")
245+
sys.exit(0)
246246

247247

248248
def get_gpu_mem_utilization():
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import sys
2+
import unittest
3+
4+
from tests.recipe_training_tests.shortened_recipes_accuracy_test import ShortenedRecipesAccuracyTests
5+
6+
7+
class CoreUnitTestSuiteRunner:
8+
def __init__(self):
9+
self.test_loader = unittest.TestLoader()
10+
self.recipe_tests_suite = unittest.TestSuite()
11+
self._add_modules_to_unit_tests_suite()
12+
self.test_runner = unittest.TextTestRunner(verbosity=3, stream=sys.stdout)
13+
14+
def _add_modules_to_unit_tests_suite(self):
15+
"""
16+
_add_modules_to_unit_tests_suite - Adds unit tests to the Unit Tests Test Suite
17+
:return:
18+
"""
19+
self.recipe_tests_suite.addTest(self.test_loader.loadTestsFromModule(ShortenedRecipesAccuracyTests))
20+
21+
22+
if __name__ == "__main__":
23+
unittest.main()

tests/recipe_training_tests/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import unittest
2+
import shutil
3+
4+
from coverage.annotate import os
5+
from super_gradients.common.environment import environment_config
6+
import torch
7+
8+
9+
class ShortenedRecipesAccuracyTests(unittest.TestCase):
10+
@classmethod
11+
def setUp(cls):
12+
cls.experiment_names = ["shortened_cifar10_resnet_accuracy_test", "shortened_coco2017_yolox_n_map_test", "shortened_cityscapes_regseg48_iou_test"]
13+
14+
def test_shortened_cifar10_resnet_accuracy(self):
15+
self.assertTrue(self._reached_goal_metric(experiment_name="shortened_cifar10_resnet_accuracy_test", metric_value=0.9167, delta=0.05))
16+
17+
def test_shortened_coco2017_yolox_n_map(self):
18+
self.assertTrue(self._reached_goal_metric(experiment_name="shortened_coco2017_yolox_n_map_test", metric_value=0.044, delta=0.02))
19+
20+
def test_shortened_cityscapes_regseg48_iou(self):
21+
self.assertTrue(self._reached_goal_metric(experiment_name="shortened_cityscapes_regseg48_iou_test", metric_value=0.263, delta=0.05))
22+
23+
@classmethod
24+
def _reached_goal_metric(cls, experiment_name: str, metric_value: float, delta: float):
25+
ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, experiment_name)
26+
sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth"))
27+
metric_val_reached = sd["acc"].cpu().item()
28+
diff = abs(metric_val_reached - metric_value)
29+
print(
30+
"Goal metric value: " + str(metric_value) + ", metric value reached: " + str(metric_val_reached) + ",diff: " + str(diff) + ", delta: " + str(delta)
31+
)
32+
return diff <= delta
33+
34+
@classmethod
35+
def tearDownClass(cls) -> None:
36+
# ERASE ALL THE FOLDERS THAT WERE CREATED DURING THIS TEST
37+
for folder in cls.experiment_names:
38+
ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, folder)
39+
if os.path.isdir(ckpt_dir):
40+
shutil.rmtree(ckpt_dir)
41+
42+
43+
if __name__ == "__main__":
44+
unittest.main()

0 commit comments

Comments
 (0)