Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
dchourasia committed Feb 28, 2025
2 parents c132a8a + bac8162 commit 62c8665
Show file tree
Hide file tree
Showing 7 changed files with 464 additions and 216 deletions.
3 changes: 2 additions & 1 deletion images/runtime/training/cuda/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ torch = "==2.4.1"
sentencepiece = "<0.3,>=0.1.99"
tokenizers = "<1.0,>=0.13.3"
tqdm = "<5.0,>=4.66.2"
trl = ">=0.15.1"
trl = ">=0.15.2"
protobuf = "<6.0.0,>=5.28.0"
simpleeval = "<1.0,>=0.9.13"
safetensors = "*"
Expand All @@ -27,6 +27,7 @@ pydantic = ">=2.7.0"
deepspeed = ">=0.14.3"
aiofiles = ">=23.2.1"
async-timeout = "==4.0.3"
tensorboard = "2.19.0"

[dev-packages]

Expand Down
314 changes: 212 additions & 102 deletions images/runtime/training/cuda/Pipfile.lock

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions images/runtime/training/rocm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,19 @@ RUN micropipenv install && \
chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
fix-permissions /opt/app-root -P

# Install Flash Attention
ENV GPU_ARCHS=gfx90a;gfx941;gfx942

RUN pip install wheel ninja

# Build Flash Attention v2.7.4 from source inside a temporary directory that is
# removed at the end of the same layer. Every step is chained with `&&` and
# every line except the last must end in `\` so the whole sequence stays a
# single RUN instruction.
RUN export TMP_DIR=$(mktemp -d) \
&& cd $TMP_DIR \
&& git clone --depth 1 --branch v2.7.4 https://github.com/Dao-AILab/flash-attention.git \
&& cd flash-attention \
&& git submodule update --init \
&& MAX_JOBS="16" python3 setup.py install --verbose \
&& rm -rf $TMP_DIR

# Restore user workspace
USER 1001

Expand Down
3 changes: 2 additions & 1 deletion images/runtime/training/rocm/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pytorch-triton-rocm = {version = "*", index = "pytorch"}
sentencepiece = "<0.3,>=0.1.99"
tokenizers = "<1.0,>=0.13.3"
tqdm = "<5.0,>=4.66.2"
trl = ">=0.15.1"
trl = ">=0.15.2"
protobuf = "<6.0.0,>=5.28.0"
simpleeval = "<1.0,>=0.9.13"
safetensors = "*"
Expand All @@ -33,6 +33,7 @@ pydantic = ">=2.7.0"
deepspeed = ">=0.14.3"
aiofiles = ">=23.2.1"
async-timeout = "==4.0.3"
tensorboard = "2.19.0"

[dev-packages]

Expand Down
314 changes: 212 additions & 102 deletions images/runtime/training/rocm/Pipfile.lock

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions tests/kfto/kfto_mnist_sdk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package kfto
import (
"strings"
"testing"
"time"

. "github.com/onsi/gomega"
. "github.com/project-codeflare/codeflare-common/support"
Expand All @@ -41,10 +42,11 @@ func TestMnistSDK(t *testing.T) {
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")

requiredChangesInNotebook := map[string]string{
"${api_url}": GetOpenShiftApiUrl(test),
"${password}": userToken,
"${num_gpus}": "0",
"${namespace}": namespace.Name,
"${api_url}": GetOpenShiftApiUrl(test),
"${password}": userToken,
"${num_gpus}": "0",
"${namespace}": namespace.Name,
"${training_image}": GetCudaTrainingImage(),
}

jupyterNotebook := string(readFile(test, "resources/mnist_kfto.ipynb"))
Expand Down Expand Up @@ -81,7 +83,7 @@ func TestMnistSDK(t *testing.T) {
Should(WithTransform(PyTorchJobConditionRunning, Equal(v1.ConditionTrue)))

// Make sure that the job eventually succeeds
test.Eventually(PyTorchJob(test, namespace.Name, "pytorch-ddp")).
test.Eventually(PyTorchJob(test, namespace.Name, "pytorch-ddp"), TestTimeoutLong, 1*time.Second).
Should(WithTransform(PyTorchJobConditionSucceeded, Equal(v1.ConditionTrue)))

// TODO: write torch job logs?
Expand Down
21 changes: 16 additions & 5 deletions tests/kfto/resources/mnist_kfto.ipynb
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "ebdb3af3",
"metadata": {},
"outputs": [],
"source": [
"%pip install -U kubeflow-training"
]
},
{
"cell_type": "code",
"execution_count": 6,
Expand All @@ -19,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"id": "72dd1751",
"metadata": {},
"outputs": [],
Expand All @@ -28,7 +38,8 @@
"num_gpus = \"${num_gpus}\"\n",
"openshift_api_url = \"${api_url}\"\n",
"namespace = \"${namespace}\"\n",
"token = \"${password}\""
"token = \"${password}\"\n",
"training_image= \"${training_image}\""
]
},
{
Expand Down Expand Up @@ -57,9 +68,9 @@
" train_func=train_func,\n",
" num_workers=1,\n",
" resources_per_worker={\"gpu\": num_gpus},\n",
" base_image=\"quay.io/kpostlet/torch-train:with-minivision\",\n",
" # packages_to_install=[\"torchvision==0.19.0\", \"--target=/tmp/lib\"],\n",
" # env_vars={\"PYTHONPATH\": \"/tmp/lib:$PYTHONPATH\", \"NCCL_DEBUG\": \"INFO\", \"TORCH_DISTRIBUTED_DEBUG\": \"DETAIL\"}\n",
" base_image=training_image,\n",
" packages_to_install=[\"torchvision==0.19.0\",\"minio==7.2.13\", \"--target=/tmp/lib\"],\n",
" env_vars={\"PYTHONPATH\": \"/tmp/lib:$PYTHONPATH\", \"NCCL_DEBUG\": \"INFO\", \"TORCH_DISTRIBUTED_DEBUG\": \"DETAIL\"}\n",
")"
]
},
Expand Down

0 comments on commit 62c8665

Please sign in to comment.