Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

separate outputs #334

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/unit_test_4gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ jobs:
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
python ./test_runner.py
mv outputs artifacts-to-be-uploaded
mkdir artifacts-to-be-uploaded
python ./test_runner.py artifacts-to-be-uploaded
# upload-coverage:
# - name: Upload Coverage to Codecov
# uses: codecov/codecov-action@v3
51 changes: 30 additions & 21 deletions test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import glob
import argparse
import os
import shutil
import subprocess
from collections import defaultdict
from dataclasses import dataclass
Expand All @@ -18,6 +17,11 @@
import tomli as tomllib


parser = argparse.ArgumentParser()
parser.add_argument("output_dir")
args = parser.parse_args()


@dataclass
class OverrideDefinitions:
"""
Expand All @@ -29,7 +33,6 @@ class OverrideDefinitions:


CONFIG_DIR = "./train_configs"
test_checkpoint_dir = "./test_runner_checkpoint"

"""
key is the config file name and value is a list of OverrideDefinitions
Expand All @@ -40,25 +43,39 @@ class OverrideDefinitions:
integration_tests_flavors["debug_model.toml"] = [
OverrideDefinitions(
[
["--training.compile"],
[
f"--job.dump_folder {args.output_dir}/default/",
],
],
"Default",
),
OverrideDefinitions(
[
[
"--training.compile",
f"--job.dump_folder {args.output_dir}/1d_compile/",
],
],
"1D compile",
),
OverrideDefinitions(
[
["--training.tensor_parallel_degree 2 --model.norm_type=rmsnorm"],
[
"--training.tensor_parallel_degree 2 --model.norm_type=rmsnorm",
f"--job.dump_folder {args.output_dir}/eager_2d/",
],
],
"Eager mode 2DParallel",
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_full_checkpoint",
f"--job.dump_folder {args.output_dir}/full_checkpoint/",
],
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_full_checkpoint",
f"--job.dump_folder {args.output_dir}/full_checkpoint/",
"--training.steps 20",
],
],
Expand All @@ -68,7 +85,7 @@ class OverrideDefinitions:
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_model_weights_only_fp32",
f"--job.dump_folder {args.output_dir}/model_weights_only_fp32/",
"--checkpoint.model_weights_only",
],
],
Expand All @@ -78,7 +95,7 @@ class OverrideDefinitions:
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_model_weights_only_bf16",
f"--job.dump_folder {args.output_dir}/model_weights_only_bf16/",
"--checkpoint.model_weights_only",
"--checkpoint.export_dtype bfloat16",
],
Expand All @@ -93,7 +110,9 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str):
for override_arg in test_flavor.override_args:
cmd = f"CONFIG_FILE={full_path} NGPU=4 LOG_RANK=0,1,2,3 ./run_llama_train.sh"
if override_arg:
cmd += " " + " ".join(override_arg)
cmd += (
" " + " ".join(override_arg)
)
print(
f"=====Integration test, flavor : {test_flavor.test_descr}, command : {cmd}====="
)
Expand All @@ -118,15 +137,5 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str):
config = tomllib.load(f)
is_integration_test = config["job"].get("use_for_integration_test", False)
if is_integration_test:
test_flavors = [OverrideDefinitions()] + integration_tests_flavors[
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The OverrideDefinitions() that was here got moved to the top as the 'Default' entry, which now specifies its own output dir.

config_file
]

for test_flavor in test_flavors:
for test_flavor in integration_tests_flavors[config_file]:
run_test(test_flavor, full_path)

# Deleting checkpoint folder from test
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why we did this in the first place. The ckpts were already in separate dirs so they wouldn't interfere. And now that I am preserving output artifacts, it makes sense to keep them.

dir_list = glob.iglob(f"{test_checkpoint_dir}_*")
for path in dir_list:
if os.path.exists(path):
shutil.rmtree(path)
Loading