Commit aa159ef

Merge pull request #16 from determined-ai/mldm_28_mlde_267

fixed training images and BYOM

2 parents f72e82f + c3bed8d

36 files changed: +87 −77 lines changed

bring-your-own-model/PDK_implementation/container/deploy/customer_churn_handler.py (+20 −17)

````diff
@@ -24,17 +24,17 @@ class CustomerChurnHandler(BaseHandler):
 
     def __init__(self):
         super(CustomerChurnHandler, self).__init__()
-
+
         f = open("numscale.json")
         self.scale_dict = json.load(f)
         f.close()
 
     def scale_data(self, df):
         for col in self.scale_dict:
             df[col] = (df[col] - self.scale_dict[col]["mean"]) / self.scale_dict[col]["std"]
-
+
         return df
-
+
     def encode_categories(self, df):
         expected_categories = {}
         expected_categories["new_cell"] = ['U','Y','N']
@@ -56,16 +56,16 @@ def encode_categories(self, df):
         expected_categories["kid11_15"] = ['U','Y']
         expected_categories["kid16_17"] = ['U','Y']
         expected_categories["creditcd"] = ['Y','N']
-
+
         for col in expected_categories:
             categorical_col = pd.Categorical(df[col], categories=expected_categories[col], ordered=False)
             one_hot_cols = pd.get_dummies(categorical_col, prefix=col)
             df.drop(col, axis=1, inplace=True)
             df = pd.concat([df, one_hot_cols], axis=1)
-
+
         return df
 
-    def preprocess(self, requests):
+    def preprocess(self, data):
         """
         Get the data from the JSON request in a dictionary, convert it to a pandas DataFrame.
         Then scale its numerical features using values from numscale.json, encode its categorical features,
@@ -77,25 +77,28 @@ def preprocess(self, requests):
         """
 
         # unpack the data
-        data = requests[0].get('body')
-        if data is None:
-            data = requests[0].get('data')
-
-        df = pd.DataFrame.from_dict(data).reset_index(drop=True)
-        logger.info('Successfully converted json/dict back to pandas DataFrame')
-
+        df_data = data[0]['data']
+        df = pd.DataFrame.from_dict(df_data).reset_index(drop=True)
+        logger.info('Successfully converted json/dict back to pandas DataFrame')
+
         df = self.scale_data(df)
         logger.info('Numerical features successfully scaled')
-
+
         df = self.encode_categories(df)
         logger.info('Categorical features successfully encoded')
-
+
         feature_cols = list(df.columns)
         label_col = "churn"
         if label_col in feature_cols:
            feature_cols.remove(label_col)
-
-        input_tensor = torch.Tensor(df[feature_cols].values)
+
+        feature_values = df[feature_cols].values
+        x = []
+        for feature in feature_values:
+            x.append(feature)
+
+        input_tensor = torch.Tensor(x)
+        #input_tensor = torch.Tensor(df[feature_cols].values)
         logger.info('Dataframe successfully converted to tensor')
 
         return input_tensor
````

(The bare `-`/`+` pairs above are trailing-whitespace cleanups.)
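After this change, `preprocess` expects each request item to carry its payload under a `data` key. For orientation, a client-side call could look like the sketch below, assuming the model is served behind the KServe v1 REST protocol; the host, column names, and values are illustrative placeholders, not taken from this commit:

```python
import requests  # hypothetical smoke test; not part of this commit

# Under the KServe v1 protocol, each entry of "instances" reaches the handler
# as one element of `data`, so data[0]['data'] is the dict below.
payload = {
    "instances": [
        {
            "data": {
                # column -> list of values; names and values are placeholders
                "rev_Mean": [23.9],
                "new_cell": ["U"],
                "creditcd": ["Y"],
            }
        }
    ]
}

resp = requests.post(
    "http://<ingress-host>/v1/models/customer-churn:predict",  # placeholder host
    json=payload,
    timeout=30,
)
print(resp.json())
```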

bring-your-own-model/PDK_implementation/container/deploy/deploy.py (+1 −1)

````diff
@@ -70,7 +70,7 @@ def create_mar_file(model_name, model_version):
 
 def create_properties_file(model_name, model_version):
     config_properties = """inference_address=http://0.0.0.0:8085
-management_address=http://0.0.0.0:8081
+management_address=http://0.0.0.0:8083
 metrics_address=http://0.0.0.0:8082
 grpc_inference_port=7070
 grpc_management_port=7071
````

bring-your-own-model/PDK_implementation/container/deploy/requirements.txt (+1 −1)

````diff
@@ -8,4 +8,4 @@ kserve==0.9.0
 determined==0.22.0
 torch==1.13.1
 pandas==1.5.2
-scikit-learn
+scikit-learn
````

(Whitespace-only change at the end of the file.)

bring-your-own-model/PDK_implementation/pipelines/_on_prem_deployment-pipeline.json (+1 −1)

````diff
@@ -18,7 +18,7 @@
 "stdin": [
   "python deploy.py --k8s-config-file /determined_shared_fs/k8s.config --deployment-name customer-churn --service-account-name pach-deploy --resource-requests cpu=2,memory=4Gi --resource-limits cpu=10,memory=8Gi"
 ],
-"image": "pachyderm/pdk:byom-deploy-v0.0.1",
+"image": "pachyderm/pdk:byom-deploy-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

bring-your-own-model/PDK_implementation/pipelines/_on_prem_training-pipeline.json (+1 −1)

````diff
@@ -19,7 +19,7 @@
 "stdin": [
   "python train.py --git-url https://[email protected]:/determined-ai/pdk.git --git-ref main --sub-dir bring-your-own-model/PDK_implementation/experiment --config const.yaml --repo customer-churn-data --model customer-churn --project pdk-customer-churn"
 ],
-"image": "pachyderm/pdk:train-v0.0.1",
+"image": "pachyderm/pdk:train-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

bring-your-own-model/PDK_implementation/pipelines/deployment-pipeline.json (+1 −1)

````diff
@@ -18,7 +18,7 @@
 "stdin": [
   "python deploy.py --deployment-name customer-churn --cloud-model-host gcp --cloud-model-bucket pdk-repo-models --resource-requests cpu=2,memory=4Gi --resource-limits cpu=10,memory=8Gi"
 ],
-"image": "us-central1-docker.pkg.dev/dai-dev-554/pdk-registry/pdk_customer_churn_deploy:2.2",
+"image": "pachyderm/pdk:byom-deploy-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

bring-your-own-model/PDK_implementation/pipelines/training-pipeline.json (+1 −1)

````diff
@@ -19,7 +19,7 @@
 "stdin": [
   "python train.py --git-url https://[email protected]:/determined-ai/pdk.git --git-ref main --sub-dir bring-your-own-model/PDK_implementation/experiment --config const.yaml --repo customer-churn-data --model customer-churn --project pdk-customer-churn"
 ],
-"image": "pachyderm/pdk:train-v0.0.1",
+"image": "pachyderm/pdk:train-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

bring-your-own-model/readme.md (+22 −10)

````diff
@@ -4,7 +4,7 @@
 
 # PDK - Pachyderm | Determined | KServe
 ## Bringing Your Model to PDK
-**Date/Revision:** August 30, 2023
+**Date/Revision:** January 02, 2024
 
 In this section, we will train and deploy a simple customer churn model on PDK.
 
@@ -72,25 +72,25 @@ data:
 
 * Additionally, if the original experiment had a training length specified in number of epochs, it may be convenient to **define training length in number of batches instead** (the same applies for **min_validation_period**).
   * Indeed, the number of samples in the training set will now vary as new data gets committed to the MLDM repository, and knowing that number of samples is mandatory to define training length in number of epochs.
-  * Note that the training pipeline image could be modified to deal with that issue, but specifying the training length in batches is a simple solution.
+  * Note that the training pipeline image could be modified to deal with that issue, but specifying the training length in batches is a simpler solution.
 * Depending on the organization of the MLDE cluster where these automatically triggered experiments are expected to run, it may also be a good idea to **edit the workspace and project fields accordingly**.
 
 
 
 ### Step 1-2: Add code to download data from MLDM
-* In **startup-hook.sh**, install python-pachyderm.
-* In **data.py**, add the imports (_os_, _shutil_, _python-pachyderm_) that are required to define the two new functions to add: _safe_open_wb_, and _download_pach_repo_. The later one being used to download data from the MLDM repository.
-* **Note:** In this example, _download_pach_repo_ will only download files corresponding to the difference between current and last commit on the MLDM repository. It won't redownload and retrain on the initial *data_part1* if *data_part2* has been committed afterwards. You can change that behaviour by editing the _download_pach_repo_ function.
+* In **startup-hook.sh**, install `pachyderm-sdk`.
+* In **data.py**, add the imports (`os`, `shutil`, `python-pachyderm`) that are required to define the two new functions to add: `safe_open_wb` and `download_pach_repo`. The latter is used to download data from the MLDM repository.
+* **Note:** In this example, `download_pach_repo` will only download files corresponding to the difference between the current and last commit on the MLDM repository. It won't redownload and retrain on the initial *data_part1* if *data_part2* has been committed afterwards. You can change that behaviour by editing the `download_pach_repo` function.
 * In **model_def.py**:
-  * Add _os_, _logging_ and _download_pach_repo_ as imports
+  * Add `os`, `logging` and `download_pach_repo` as imports
   * In \_\__init___, check if the model is expected to be trained (which would require downloading data from the MLDM repository, building the training set and building the validation sets) or not.
-  * Add the _download_data_ function, that will call the _download_pach_repo_ function to download files from the MLDM repository and return the list of those files.
+  * Add the `download_data` function, which will call the `download_pach_repo` function to download files from the MLDM repository and return the list of those files.
 
 ### Step 1-3: Make sure the code handles the output of the _download_data_ function
 
-The original code may not handle a list of files, as output by the _download_data_ function. In this example, in the base experiment, a single csv data file was expected, while a list of files can be expected with the PDK experiment. Depending on your original code, and how you expect your data to be committed to MLDM, this may or may not require changes.
+The original code may not handle a list of files, as output by the `download_data` function. In this example, in the base experiment, a single csv data file was expected, while a list of files can be expected with the PDK experiment. Depending on your original code, and how you expect your data to be committed to MLDM, this may or may not require changes.
 
-In this example, the _get_train_and_validation_datasets_ function from **data.py** has been changed to concatenate a list of csv files into a single pandas DataFrame.
+In this example, the `get_train_and_validation_datasets` function from **data.py** has been changed to concatenate a list of csv files into a single pandas DataFrame.
 
 ## Step 2: Preparing MLDM and MLDE
 
````
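The readme bullets above name `safe_open_wb`, `download_pach_repo`, and the csv-concatenation change without showing them. Below is a minimal illustrative sketch, not the committed code: the download body is stubbed because the exact client calls depend on the installed `pachyderm-sdk` version, and the 80/20 split is an assumption.

```python
import glob
import os

import pandas as pd


def safe_open_wb(path):
    """Open `path` for binary writing, creating parent directories first."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return open(path, "wb")


def download_pach_repo(host, port, repo, branch, root_dir):
    """Placeholder: download new/changed files from an MLDM repo into root_dir.

    The real function diffs the current commit against the previous one and
    writes each changed file via safe_open_wb(); the exact client calls depend
    on the installed pachyderm-sdk / python-pachyderm version, so they are
    omitted here. Returns the list of local paths written.
    """
    raise NotImplementedError


def get_train_and_validation_datasets(data_dir):
    """Concatenate every downloaded csv into one DataFrame, then split it.

    The 80/20 split below is illustrative; the committed function may differ.
    """
    csv_files = sorted(glob.glob(os.path.join(data_dir, "**", "*.csv"), recursive=True))
    df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
    cut = int(len(df) * 0.8)
    return df.iloc[:cut], df.iloc[cut:]
```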
````diff
@@ -115,6 +115,18 @@ By default, we are using the same Workspace that was created in the deployment t
 det p create "PDK Demos" pdk-customer-churn
 ```
 
+### Step 2-3: Create the storage bucket folders
+
+Create the following folder structure in the storage bucket (can be skipped for vanilla Kubernetes deployments):
+
+```bash
+customer-churn
+customer-churn/config
+customer-churn/model-store
+```
+
+ 
+
 
 
 ## Step 3: Create the training pipeline
````
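Object stores have no real directories, so "creating" these folders amounts to writing zero-byte placeholder objects. A hedged sketch for a GCS bucket using the `google-cloud-storage` client (the bucket name is a placeholder; adapt for S3 or other backends):

```python
from google.cloud import storage  # assumes the google-cloud-storage package

client = storage.Client()
bucket = client.bucket("pdk-repo-models")  # placeholder bucket name

# Zero-byte objects whose names end in "/" render as folders in the console.
for prefix in ("customer-churn/",
               "customer-churn/config/",
               "customer-churn/model-store/"):
    bucket.blob(prefix).upload_from_string(b"")
```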
````diff
@@ -132,7 +144,7 @@ In case this is not the case or if you want to dig deeper into the details, all
 * Name this MLDM pipeline by changing the _pipeline.name_.
 * Make sure the input repo matches the MLDM repository where data is expected to be committed.
 * Under _transform_:
-  * Define the image to be used. The current image corresponds to files in the **container/train** folder and should work well as it is.
+  * Define the image to be used. The current image configured in the pipeline should work well as it is.
   * _stdin_ command will be run when the pipeline is triggered. Make sure to change all the relevant options, in particular:
     * _--git-url_ to point to the Git URL containing the model code, since you probably want to change details in the experiment files.
     * _--sub-dir_ if the file structure of your git repository is different to this one.
````

deploy/README.md (+3 −7)

````diff
@@ -152,9 +152,7 @@ Also, a Workspace and Project were configured for this experiment. You can change
 
  
 
-**Important**: The default setting for the examples included here is to run on the *gpu-pool* resource pool. If your MLDE instance does not have a resource pool called *gpu-pool*, the experiments will fail to run. Make sure to modify the experiment files as needed.
-
-Also, don't forget to create a Workspace and a Project in MLDE with the same name as configured in the file; otherwise, the experiment will fail to run. This can be done in the Workspaces page in the UI.
+Don't forget to create a Workspace and a Project in MLDE with the same name as configured in the file; otherwise, the experiment will fail to run. This can be done in the Workspaces page in the UI.
 
 ![alt text][github_03_workspaces]
 
@@ -192,8 +190,6 @@ A brief description of the Experiment files:
 
 The experiment files don't need to be modified, except for the Workspace and Project name in the `const.yaml` file. Do keep in mind that, at runtime, the pipeline will pull this code from Github. Any changes to any of the files need to be uploaded to your repository.
 
-
-
  
 
 ### MLDM Images
@@ -375,7 +371,7 @@ In the Training pipeline file, change the command line to point to your github r
 "stdin": [
   "python train.py --git-url https://[email protected]:/determined-ai/pdk.git --git-ref main --sub-dir examples/dog-cat/experiment --config const.yaml --repo dogs-and-cats-data --model dogs-and-cats --project pdk-dogs-and-cats"
 ],
-"image": "pachyderm/pdk:train-v0.0.1",
+"image": "pachyderm/pdk:train-v0.0.3",
 ```
 
 
@@ -438,7 +434,7 @@ Also, replace the path to your image, or use the default value.
 "stdin": [
   "python deploy.py --deployment-name dog-cat --cloud-model-host gcp --cloud-model-bucket pdk-repo-models --resource-requests cpu=2,memory=8Gi --resource-limits cpu=10,memory=8Gi"
 ],
-"image": "pachyderm/pdk:dog-cat-deploy-v0.0.1",
+"image": "pachyderm/pdk:dog-cat-deploy-v0.0.3",
 ```
  
````
deploy/deploy_aws.md (+14 −5)

````diff
@@ -654,6 +654,14 @@ provisioner: kubernetes.io/aws-ebs
 parameters:
   type: gp3
   fsType: ext4
+volumeBindingMode: WaitForFirstConsumer
+allowedTopologies:
+- matchLabelExpressions:
+  - key: failure-domain.beta.kubernetes.io/zone
+    values:
+    - ${AWS_AVAILABILITY_ZONE_1}
+    - ${AWS_AVAILABILITY_ZONE_2}
+    - ${AWS_AVAILABILITY_ZONE_3}
 EOF
 ```
 
````

````diff
@@ -946,7 +954,8 @@ To create the databases using the psql pod, use these commands:
 ```bash
 kubectl run psql -it --rm=true --image=postgres:13 --command -- psql -h ${RDS_CONNECTION_URL} -U postgres postgres
 
-# The prompt will freeze as it waits for the password. Type the password and press enter.
+# The prompt will freeze as it loads the pod. Wait for the message "If you don't see a command prompt, try pressing enter".
+# Then, type the password and press enter.
 
 postgres=> CREATE DATABASE pachyderm;
 
````
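If you prefer to script this step instead of typing into the interactive psql pod, a hedged alternative is to create the databases from Python with `psycopg2` (connection details are placeholders; `CREATE DATABASE` must run with autocommit enabled because it cannot execute inside a transaction):

```python
import os

import psycopg2  # assumes psycopg2-binary is installed; not part of this commit

# Placeholders: reuse the same RDS endpoint and credentials as the psql pod.
conn = psycopg2.connect(
    host=os.environ["RDS_CONNECTION_URL"],
    user="postgres",
    password=os.environ["RDS_ADMIN_PASSWORD"],  # hypothetical variable name
    dbname="postgres",
)
conn.autocommit = True  # CREATE DATABASE cannot run inside a transaction

with conn.cursor() as cur:
    # "dex" is typically also required by MLDM; adjust to match the guide.
    for db in ("pachyderm", "dex"):
        cur.execute(f'CREATE DATABASE "{db}"')

conn.close()
```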
````diff
@@ -1041,7 +1050,7 @@ After running this command, wait about 10 minutes for all the services to be pro
 
 As of MLDM version 2.8.1, a single Helm chart can be used to deploy both MLDM and MLDE.
 
-Because we're using the AWS buckets, there are 2 service accounts in the MLDM namespace that will need access to S3: the main MLDM service account and the `worker` MLDM service account, which runs the pipeline code.
+Because we're using the AWS buckets, there are 2 service accounts that will need access to S3: the main MLDM service account and the `worker` MLDM service account, which runs the pipeline code.
 
 The EKS installation command created the necessary roles with the right permissions, all we need to do is configure the service account to leverage those roles. Run these commands to set the proper ARNs for the roles:
 
````
````diff
@@ -1375,8 +1384,6 @@ pachctl list commit images
 
 pachctl create pipeline -f https://raw.githubusercontent.com/pachyderm/pachyderm/2.6.x/examples/opencv/edges.json
 
-
-
 wget http://imgur.com/8MN9Kg0.png
 
 pachctl put file images@master:AT-AT.png -f 8MN9Kg0.png
````
````diff
@@ -1394,6 +1401,8 @@ pachctl list job
 
  
 
+PS: If you used the default image size for the CPU nodes, the new pipelines may fail at first due to lack of available CPUs. In this case, the autoscaler should automatically add a new node to the CPU node group. Once the new CPUs are available, the pipeline should start automatically.
+
 At this time, you should see the OpenCV project and pipeline in the MLDM UI:
 
 
````
````diff
@@ -1649,7 +1658,7 @@ A more detailed explanation of these attributes:
 
  
 
-This secret needs to be created in the MLDM namespace, as it will be used by the pipelines (that will then map the variables to the MLDE experiment):
+This secret will be used by the pipelines, to map the variables for the MLDE experiments:
 
 ```bash
 kubectl apply -f pipeline-secret.yaml
````
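For orientation, the pipeline code typically reads the values from this secret as environment variables inside the pipeline container. A hypothetical sketch (the variable names below are placeholders, not taken from this commit):

```python
import os

# Hypothetical: the names depend on how pipeline-secret.yaml maps its keys
# into the pipeline spec's environment.
det_master = os.environ.get("DET_MASTER", "")      # MLDE master address
det_user = os.environ.get("DET_USER", "")          # MLDE user for the pipeline
det_password = os.environ.get("DET_PASSWORD", "")  # matching password

if not det_master:
    raise RuntimeError("pipeline secret not mounted: DET_MASTER is unset")
```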

examples/brain-mri/experiment/const.yaml (−3)

````diff
@@ -32,9 +32,6 @@ searcher:
   epochs: 4
 min_validation_period:
   epochs: 1
-resources:
-  resource_pool: gpu-pool
-  slots_per_trial: 1
 max_restarts: 0
 entrypoint: model_def:MRIUnetTrial
 profiling:
````

examples/brain-mri/pipelines/_on_prem_deployment-pipeline.json (+1 −1)

````diff
@@ -19,7 +19,7 @@
 "stdin": [
   "python deploy.py --deployment-name brain-mri-deploy --service-account-name pach-deploy --resource-requests cpu=2,memory=4Gi --resource-limits cpu=4,memory=4Gi"
 ],
-"image": "us-central1-docker.pkg.dev/dai-dev-554/pdk-registry/pdk_brain_mri_deploy:3.2",
+"image": "pachyderm/pdk:brain-deploy-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

examples/brain-mri/pipelines/_on_prem_training-pipeline.json (+1 −1)

````diff
@@ -20,7 +20,7 @@
 "stdin": [
   "python train.py --git-url https://[email protected]:/determined-ai/pdk.git --git-ref main --sub-dir examples/brain-mri/experiment --config const.yaml --repo brain-mri-data --model brain-mri --project pdk-brain-mri"
 ],
-"image": "pachyderm/pdk:train-v0.0.1",
+"image": "pachyderm/pdk:train-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

examples/brain-mri/pipelines/deployment-pipeline.json (+1 −1)

````diff
@@ -18,7 +18,7 @@
 "stdin": [
   "python deploy.py --deployment-name brain-mri-deploy --cloud-model-host gcp --cloud-model-bucket pdk-repo-models --resource-requests cpu=2,memory=8Gi --resource-limits cpu=10,memory=8Gi"
 ],
-"image": "us-central1-docker.pkg.dev/dai-dev-554/pdk-registry/pdk_brain_mri_deploy:3.2",
+"image": "pachyderm/pdk:brain-deploy-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

examples/brain-mri/pipelines/training-pipeline.json (+1 −1)

````diff
@@ -19,7 +19,7 @@
 "stdin": [
   "python train.py --git-url https://[email protected]:/determined-ai/pdk.git --git-ref main --sub-dir examples/brain-mri/experiment --config const.yaml --repo brain-mri-data --model brain-mri --project pdk-brain-mri"
 ],
-"image": "pachyderm/pdk:train-v0.0.1",
+"image": "pachyderm/pdk:train-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

examples/dog-cat/pipelines/_on_prem_deployment-pipeline.json (+1 −1)

````diff
@@ -19,7 +19,7 @@
 "stdin": [
   "python deploy.py --deployment-name dogcat-deploy --service-account-name pach-deploy --resource-requests cpu=2,memory=4Gi --resource-limits cpu=4,memory=4Gi"
 ],
-"image": "us-central1-docker.pkg.dev/dai-dev-554/pdk-registry/pdk_cats_dogs_deploy:2.1",
+"image": "pachyderm/pdk:dog-cat-deploy-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

examples/dog-cat/pipelines/_on_prem_training-pipeline.json (+1 −1)

````diff
@@ -20,7 +20,7 @@
 "stdin": [
   "python train.py --git-url https://[email protected]:/determined-ai/pdk.git --git-ref main --sub-dir examples/dog-cat/experiment --config const.yaml --repo dogs-and-cats-data --model dogs-and-cats --project pdk-dogs-and-cats"
 ],
-"image": "pachyderm/pdk:train-v0.0.1",
+"image": "pachyderm/pdk:train-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

examples/dog-cat/pipelines/deployment-pipeline.json (+1 −1)

````diff
@@ -18,7 +18,7 @@
 "stdin": [
   "python deploy.py --deployment-name dog-cat --cloud-model-host gcp --cloud-model-bucket pdk-repo-models --resource-requests cpu=2,memory=8Gi --resource-limits cpu=10,memory=8Gi"
 ],
-"image": "us-central1-docker.pkg.dev/dai-dev-554/pdk-registry/pdk_cats_dogs_deploy:2.1",
+"image": "pachyderm/pdk:dog-cat-deploy-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

examples/dog-cat/pipelines/training-pipeline.json (+1 −1)

````diff
@@ -19,7 +19,7 @@
 "stdin": [
   "python train.py --git-url https://[email protected]:/determined-ai/pdk.git --git-ref main --sub-dir examples/dog-cat/experiment --config const.yaml --repo dogs-and-cats-data --model dogs-and-cats --project pdk-dogs-and-cats"
 ],
-"image": "pachyderm/pdk:train-v0.0.1",
+"image": "pachyderm/pdk:train-v0.0.4",
 "secrets": [
   {
     "name": "pipeline-secret",
````

examples/object-detection/experiment/const-distributed-search.yaml (−1)

````diff
@@ -51,7 +51,6 @@ searcher:
   batches: 51520 # 50*1288 == 51520 # Real Training
 records_per_epoch: 1288
 resources:
-  resource_pool: gpu-pool
   slots_per_trial: 8
   shm_size: 2000000000
 max_restarts: 0
````
