# This is the Ray on Golem example yaml:
# - the example config for a testnet cluster,
# - ready for an easy switch to mainnet,
# - with properties allowing fine-tuning
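
# Example usage (assuming the `ray` CLI and `ray-on-golem` are installed;
# `my_job.py` is a placeholder script name, not part of this repository):
#
#   ray up golem-cluster.yaml --yes          # start the cluster
#   ray submit golem-cluster.yaml my_job.py  # run a script on it
#   ray down golem-cluster.yaml --yes        # tear it down
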
# Ray on Golem cluster name
cluster_name: "golem-cluster"
# The maximum number of workers the cluster will have at any given time
max_workers: 10
# The number of minutes that need to pass before an idle worker node is removed by the Autoscaler
idle_timeout_minutes: 5
# The cloud provider-specific configuration properties.
provider:
  type: "external"
  use_internal_ips: true
  module: "ray_on_golem.provider.node_provider.GolemNodeProvider"
  parameters:
    # Blockchain used for payments:
    # `holesky` means running free nodes on the testnet,
    # `polygon` is for mainnet operations.
    payment_network: "holesky"
    #payment_network: "polygon"

    # Maximum amount of GLM that will be spent on the whole cluster
    total_budget: 5
    # Common parameters for all node types. Can be overridden in available_node_types
    node_config:
      # Parameters for Golem demands (same for head and workers)
      demand:
        # Check available versions at https://registry.golem.network/explore/golem/ray-on-golem
        image_tag: "golem/ray-on-golem:0.13.0-py3.10.13-ray2.9.3"

        # List of URLs that will be added to the Computation Manifest.
        # The protocol must be included in every URL.
        # If not provided, the demand will not use a Computation Manifest.
        outbound_urls: ["https://pypi.dev.golem.network"]
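
        # For illustration only: multiple destinations can be allowed by
        # listing more URLs (the second URL below is a hypothetical example;
        # each entry must include its protocol):
        #outbound_urls: ["https://pypi.dev.golem.network", "https://data.example.com"]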
# Minimal values of parameters offered by Golem provider
#min_mem_gib: 1
#min_cpu_threads: 1
#min_storage_gib: 1
# Maximal values of parameters offered by Golem provider
#max_cpu_threads: null
      budget_control:
        per_cpu_expected_usage:
          # The expected per-CPU cost is calculated as the sum of:
          # - start_price / cpu_count
          # - env_per_hour_price * duration_hours / cpu_count
          # - cpu_per_hour_price * duration_hours * cpu_load
          # The estimated load and duration of a worker tell budget control
          # to pick the least expensive Golem provider offers first.
          # If not provided, offers will be picked at random.
          cpu_load: 0.8
          duration_hours: 0.5 # 30 minutes

          # Amount of GLM for the expected usage; a Golem provider offer
          # will be rejected if it exceeds this value.
          max_cost: 1.5

        # Amount of GLM for worker initiation; a Golem provider offer
        # will be rejected if it exceeds this value.
        max_start_price: 0.5

        # Amount of GLM for CPU utilisation per hour; a Golem provider offer
        # will be rejected if it exceeds this value.
        max_cpu_per_hour_price: 0.5

        # Amount of GLM for each hour that a worker runs; a Golem provider
        # offer will be rejected if it exceeds this value.
        max_env_per_hour_price: 0.5
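
        # A worked example under hypothetical offer terms (values are
        # illustrative, not taken from any real offer): an offer with
        # start_price 0.1 GLM, env_per_hour_price 0.2 GLM, cpu_per_hour_price
        # 0.3 GLM and 4 CPU threads, combined with cpu_load 0.8 and
        # duration_hours 0.5 above, gives an expected per-CPU cost of
        #   0.1 / 4 + 0.2 * 0.5 / 4 + 0.3 * 0.5 * 0.8
        #   = 0.025 + 0.025 + 0.12 = 0.17 GLM
        # which is below max_cost: 1.5, so the offer would be accepted.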
# Tells the autoscaler the allowed node types and the resources they provide
available_node_types:
  ray.head.default:
    # The minimum number of worker nodes of this type to launch
    #min_workers: 0
    # The maximum number of worker nodes of this type to launch
    #max_workers: 0
    # The node type's CPU and GPU resources - leave it empty for autodetection
    resources: {}
    # Additional parameters specific to this node type, added on top of
    # node_config from provider.parameters.node_config
    node_config: {}
    #node_config:
    #  demand:
    #    min_mem_gib: 10
  ray.worker.default:
    min_workers: 1
    max_workers: 10
    resources: {}
    node_config: {}
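
  # A hypothetical additional worker type (name and values are illustrative):
  # a memory-heavy worker that only accepts offers with at least 16 GiB of
  # RAM, kept commented out by default.
  #ray.worker.large:
  #  min_workers: 0
  #  max_workers: 2
  #  resources: {}
  #  node_config:
  #    demand:
  #      min_mem_gib: 16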
# The files or directories to copy to the head and worker nodes
# Remote workdir is /root/
file_mounts: {
  # <remote_path>: <local_path>
  # "/absolute/path/dir/": ".",
  # "./relative/path/dir/": ".",
  # "./relative/path/file.txt": "./file.txt"
}
# A list of paths to the files or directories to copy from the head node to the worker nodes
cluster_synced_files: []
# List of commands that will be run to initialize the nodes (before `setup_commands`)
initialization_commands: []
#initialization_commands: [
# "pip install endplay",
#]
# List of shell commands to run to set up nodes
setup_commands: []
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands: [
  "ray start --head --node-ip-address $NODE_IP --include-dashboard=True --dashboard-host 0.0.0.0 --disable-usage-stats --autoscaling-config=~/ray_bootstrap_config.yaml",
]
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands: [
  "ray start --address $RAY_HEAD_IP:6379",
]
# Satisfy checks that would otherwise warn about legacy fields at `ray up`.
# These fields are removed by ray-on-golem on the fly.
head_node: true
worker_nodes: true