PROJECT?=pangeo-181919
NAMESPACE?=staging
RELEASE?=us-central1b-$(NAMESPACE)
ZONE?=us-central1-b
CLUSTER?=pangeo-uscentral1b
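
# Example usage (illustrative; any variable above can be overridden on the
# command line):
#
#   make cluster                  # create the GKE cluster with the defaults above
#   make pangeo NAMESPACE=prod    # deploy the pangeo release into the prod namespace
#   make destroy-nfs ZONE=us-central1-b
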
.PHONY: test cluster destroy-cluster nfs destroy-nfs pangeo serviceaccount scratch google-service-account mlflow-staging mlflow-prod metrics

# Creation of the Kubernetes Cluster for Google
#
# Notes
# -----
# The pangeo-rbac steps (commented out below) are untested.
#
# References
# ----------
# https://cloud.google.com/kubernetes-engine/docs/how-to/node-auto-provisioning
# https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity
# ----------------------------------------------------------------------------
# Kubernetes Cluster
#
#
# TODO: enable nodepool auto-provisioning
# --enable-autoprovisioning --min-cpu 1 --min-memory 1 --max-cpu 1000 --max-memory 5200 \
# --autoprovisioning-service-account=pangeo \
test:
	@echo $(CLUSTER) $(WORKER_POOL)
	@echo $(ARGS)

cluster:
	echo "[Creating cluster pool in $(ZONE)]"
	gcloud container clusters create $(CLUSTER) \
		--num-nodes=1 \
		--zone=$(ZONE) \
		--labels=cluster=$(CLUSTER) \
		--cluster-version=latest \
		--no-enable-ip-alias \
		--no-enable-legacy-authorization \
		--no-enable-basic-auth \
		--enable-autoupgrade --enable-autorepair --max-surge-upgrade=1 \
		--machine-type=n1-standard-1 \
		--enable-autoscaling --min-nodes=1 --max-nodes=2 \
		--node-labels="hub.jupyter.org/node-purpose=core" \
		--enable-autoprovisioning --autoprovisioning-config-file autoprovisioning.json \
		--enable-vertical-pod-autoscaling \
		--workload-metadata=GKE_METADATA \
		--workload-pool=$(PROJECT).svc.id.goog \
		--enable-stackdriver-kubernetes
# --autoprovisioning-service-account=pangeo  # maybe need to do this after the initial setup
# --enable-ip-alias
# --enable-private-nodes
# --enable-private-endpoint
# --master-ipv4-cidr 172.16.0.32/28
	gcloud container clusters get-credentials $(CLUSTER) --zone=$(ZONE)
# kubectl create namespace $(NAMESPACE)
# kubectl -n $(NAMESPACE) apply -f ../../pangeo-deploy/templates/pangeo-rbac.yaml
# gcloud iam service-accounts add-iam-policy-binding \
#   --role roles/iam.workloadIdentityUser \
#   --member "serviceAccount:$(PROJECT).svc.id.goog[$(NAMESPACE)/pangeo]" \
# kubectl annotate serviceaccount \
#   --namespace $(NAMESPACE) \
#   pangeo \
#   iam.gke.io/gcp-service-account=pangeo@$(PROJECT).iam.gserviceaccount.com
# kubectl apply -f https://raw.githubusercontent.com/dask/dask-gateway/0.7.1/resources/helm/dask-gateway/crds/traefik.yaml -n $(NAMESPACE)
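
# The cluster target above reads ./autoprovisioning.json. A minimal sketch of
# that file, reusing the resource limits from the TODO at the top; anything
# beyond resourceLimits is an assumption (see the node-auto-provisioning docs
# linked above):
#
#   {
#     "resourceLimits": [
#       {"resourceType": "cpu", "minimum": 1, "maximum": 1000},
#       {"resourceType": "memory", "minimum": 1, "maximum": 5200}
#     ]
#   }
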
destroy-cluster:
	echo "[Destroying cluster: $(CLUSTER)]"
	gcloud container clusters delete $(CLUSTER) --zone=$(ZONE)

nfs:
	echo "[Creating NFS server in $(ZONE)]"
	gcloud beta filestore instances create $(CLUSTER) --zone=$(ZONE) --tier=BASIC_HDD \
		--file-share=name="home",capacity=1TB --network=name="default"
	gcloud filestore instances describe $(CLUSTER) --zone=$(ZONE) --format json \
		| jq -r ".networks[0].ipAddresses[0]"

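# To check the share from a VM or pod on the same VPC network (illustrative;
# substitute the IP printed by the describe command above, and note the share
# name "home" comes from the create command):
#
#   sudo mount -t nfs <filestore-ip>:/home /mnt/home
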
destroy-nfs:
	echo "[Destroying NFS server: $(CLUSTER)]"
	gcloud filestore instances delete $(CLUSTER) --zone=$(ZONE)

pangeo:
	helm upgrade --wait --install \
		$(RELEASE) ../../pangeo-deploy \
		--namespace=$(NAMESPACE) --version=0.1.0 \
		-f ./config/common.yaml \
		-f ./config/$(NAMESPACE).yaml \
		-f ./secrets/$(NAMESPACE).yaml

serviceaccount:
# Assumes the GSA `pangeo` exists. Run `make google-service-account` if not.
	gcloud iam service-accounts add-iam-policy-binding \
		--role roles/iam.workloadIdentityUser \
		--member "serviceAccount:$(PROJECT).svc.id.goog[$(NAMESPACE)/pangeo]" \
		pangeo@$(PROJECT).iam.gserviceaccount.com
	kubectl annotate serviceaccount \
		--overwrite --namespace $(NAMESPACE) \
		pangeo \
		iam.gke.io/gcp-service-account=pangeo@$(PROJECT).iam.gserviceaccount.com
	gcloud projects add-iam-policy-binding $(PROJECT) \
		--member serviceAccount:pangeo@$(PROJECT).iam.gserviceaccount.com \
		--role roles/serviceusage.serviceUsageConsumer
# MLFlow
	gcloud iam service-accounts add-iam-policy-binding \
		--role roles/iam.workloadIdentityUser \
		--member "serviceAccount:$(PROJECT).svc.id.goog[$(NAMESPACE)/mlflow]" \
		mlflow@$(PROJECT).iam.gserviceaccount.com
	kubectl annotate serviceaccount \
		--overwrite --namespace $(NAMESPACE) \
		mlflow \
		iam.gke.io/gcp-service-account=mlflow@$(PROJECT).iam.gserviceaccount.com
	gcloud projects add-iam-policy-binding $(PROJECT) \
		--member serviceAccount:mlflow@$(PROJECT).iam.gserviceaccount.com \
		--role roles/serviceusage.serviceUsageConsumer
# TODO: how was IAM configured for pangeo?
	gsutil iam ch serviceAccount:mlflow@$(PROJECT).iam.gserviceaccount.com:roles/storage.objectAdmin,objectViewer gs://pangeo-scratch

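# To verify a workload identity binding by hand, run a throwaway pod as the
# Kubernetes service account and check which GSA it authenticates as (a sketch
# following the GKE docs pattern; pod name and namespace are illustrative):
#
#   kubectl run -it --rm wi-test -n staging \
#     --image=google/cloud-sdk:slim \
#     --overrides='{"apiVersion": "v1", "spec": {"serviceAccountName": "pangeo"}}' \
#     -- gcloud auth list
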
scratch:
	gsutil lifecycle set lifecycle.json gs://pangeo-scratch

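# A minimal sketch of lifecycle.json (the 7-day age is an assumption; pick
# whatever retention suits a scratch bucket):
#
#   {
#     "rule": [
#       {"action": {"type": "Delete"}, "condition": {"age": 7}}
#     ]
#   }
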
google-service-account:
	gcloud iam service-accounts create pangeo
	gcloud iam service-accounts create mlflow

# Adds MLFlow as a jupyterhub service
mlflow-staging:
	helm upgrade --wait --install -n staging mlflow ../../mlflow/

mlflow-prod:
	helm upgrade --wait --install -n prod mlflow ../../mlflow/

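# To reach the MLFlow UI locally (the service name "mlflow" and port 5000 are
# assumptions about what the chart installs):
#
#   kubectl -n staging port-forward svc/mlflow 5000:5000
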
metrics:
	helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
	helm repo add grafana https://grafana.github.io/helm-charts
	helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
	kubectl create namespace metrics
# Prometheus
	helm upgrade --wait --install -n metrics prometheus prometheus-community/prometheus \
		-f ../../metrics/prometheus-config.yaml
# nginx for ingress
	helm upgrade --wait --install -n metrics ingress-nginx ingress-nginx/ingress-nginx \
		-f ../../metrics/ingress-config.yaml
# Grafana
	helm upgrade --wait --install -n metrics grafana grafana/grafana \
		-f ../../metrics/grafana-config.yaml -f ../../metrics/grafana-config-gcp.yaml
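
# To fetch the admin password generated by the grafana/grafana chart:
#
#   kubectl get secret -n metrics grafana -o jsonpath="{.data.admin-password}" | base64 --decode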