-
Notifications
You must be signed in to change notification settings - Fork 445
/
Copy pathray-cluster.tpu-v4-singlehost.yaml
109 lines (108 loc) · 3.1 KB
/
ray-cluster.tpu-v4-singlehost.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# This template contains a KubeRay cluster using a 2x2x1 TPU v4 PodSlice.
# To get access to TPU resources, please follow the instructions at:
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus
#
# Layout: one CPU-only head pod plus one worker group with a single replica
# that requests 4 TPU chips (2x2x1 topology) on a single host.
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  # Label required for TPU webhook to initialize environments.
  labels:
    app.kubernetes.io/name: kuberay
  name: example-cluster-kuberay
spec:
  # Head pod: requests CPU/memory only, no TPU resources.
  headGroupSpec:
    rayStartParams: {}
    template:
      metadata:
        labels:
          cloud.google.com/gke-ray-node-type: head
          app.kubernetes.io/name: kuberay
          app.kubernetes.io/instance: example-cluster
      spec:
        imagePullSecrets: []
        containers:
          - name: ray-head
            image: rayproject/ray:2.9.0-py310
            imagePullPolicy: IfNotPresent
            resources:
              limits:
                cpu: "8"
                ephemeral-storage: 20Gi
                memory: 40G
              requests:
                cpu: "8"
                ephemeral-storage: 10Gi
                memory: 40G
            securityContext: {}
            env:
              # NOTE(review): Ray documents 0 as disabling the memory
              # monitor - confirm this is intended for the head pod.
              - name: RAY_memory_monitor_refresh_ms
                value: "0"
              # ${grafana_host} is a placeholder; presumably substituted by
              # a templating step before this manifest is applied. Quoted so
              # the rendered value always stays a string.
              - name: RAY_GRAFANA_IFRAME_HOST
                value: "http://${grafana_host}"
              - name: RAY_GRAFANA_HOST
                value: http://grafana:80
              - name: RAY_PROMETHEUS_HOST
                value: http://frontend:9090
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
              - containerPort: 8000
                name: serve
            volumeMounts:
              - mountPath: /tmp/ray
                name: ray-logs
        volumes:
          - emptyDir: {}
            name: ray-logs
  workerGroupSpecs:
    # Single TPU worker group, pinned to exactly one replica on one host.
    - groupName: workergroup
      rayStartParams: {}
      replicas: 1
      minReplicas: 1
      maxReplicas: 1
      numOfHosts: 1
      template:
        metadata:
          labels:
            cloud.google.com/gke-ray-node-type: worker
            app.kubernetes.io/name: kuberay
            app.kubernetes.io/instance: example-cluster
        spec:
          imagePullSecrets: []
          containers:
            - name: ray-worker
              image: rayproject/ray:2.9.0-py310
              imagePullPolicy: IfNotPresent
              resources:
                limits:
                  cpu: "1"
                  ephemeral-storage: 20Gi
                  # 4 chips, matching the 2x2x1 topology selected below.
                  google.com/tpu: "4"
                  memory: 40G
                requests:
                  cpu: "1"
                  ephemeral-storage: 10Gi
                  google.com/tpu: "4"
                  memory: 40G
              securityContext: {}
              # Explicit empty lists instead of bare/null values.
              # NOTE(review): the TPU webhook mentioned in metadata presumably
              # injects the TPU environment at admission time - confirm.
              env: []
              ports: []
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
          volumes:
            - emptyDir: {}
              name: ray-logs
          # Schedule onto a TPU v4 podslice node pool with 2x2x1 topology.
          nodeSelector:
            cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
            cloud.google.com/gke-tpu-topology: 2x2x1