# This is the main configuration file for the DALiuGE Ray cluster deployment on AWS.
# It uses a custom-built docker image combining both Ray and DALiuGE into one
# image.
#
# A unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 1
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The initial number of worker nodes to launch in addition to the head
# node. When the cluster is first brought up (or when it is refreshed with a
# subsequent `ray up`) this number of nodes will be started.
initial_workers: 1
# Whether or not to autoscale aggressively. If this is enabled, then whenever
# the autoscaler would start more workers, it starts at least enough to bring
# the cluster up to initial_workers.
autoscaling_mode: default
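# A minimal sketch, assuming the Ray version this file targets supports the
# autoscaler's "aggressive" mode; to enable the behaviour described above,
# this would become:
# autoscaling_mode: aggressive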
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "icrar/dlg_ray:1.5"
# image: "rayproject/ray:latest-cpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
container_name: "ray_container"
# If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
# if no cached version is present.
pull_before_run: False
run_options: ["-v /var/run/docker.sock:/var/run/docker.sock", "-p 8265:8265"] # Extra options to pass into "docker run"
# Example of running a GPU head with CPU workers
# head_image: "rayproject/ray:latest-gpu"
# Allow Ray to automatically detect GPUs
worker_image: "icrar/dlg_ray:1.5"
worker_run_options: ["-v /var/run/docker.sock:/var/run/docker.sock", "-p 8265:8265"]
# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# The max value allowed is 1.0, which is the most conservative setting.
target_utilization_fraction: 0.8
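# The example above follows from roughly
# ceil(busy_nodes / target_utilization_fraction) = ceil(10 / 0.8) = ceil(12.5) = 13.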
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: ap-southeast-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: ap-southeast-2a,ap-southeast-2b
# Whether to allow node reuse. If set to False, nodes will be terminated
# instead of stopped.
cache_stopped_nodes: False # If not present, the default is True.
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
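# A hypothetical example (the key pair name "my-cluster-key" is a placeholder,
# not part of this deployment):
# ssh_private_key: ~/.ssh/my-cluster-key.pem
# ...and then in both head_node and worker_nodes below:
# KeyName: my-cluster-key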
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: t3a.medium
# ImageId: ami-076e39b6b14e3bb20 # Amazon 64 bit
ImageId: ami-01bf596d1dc4da556 # Ubuntu Deep learning
# You can provision additional disk space with a config such as the following:
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: t3a.medium
# ImageId: ami-076e39b6b14e3bb20 # Amazon 64 bit
ImageId: ami-01bf596d1dc4da556 # Ubuntu Deep learning
# Run workers on spot by default. Comment this out to use on-demand.
# InstanceMarketOptions:
# MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# NOTE: the source path is a local directory on the machine from which 'ray up' is called.
"/var/dlg_home":"/var/dlg_home"
}
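# A hypothetical additional mount (the paths are placeholders, not part of this
# deployment) would follow the same REMOTE_PATH: LOCAL_PATH pattern, e.g.
# "/var/dlg_home/extra_data": "/home/me/extra_data"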
# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: ["/var/dlg_home/code"]
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False
# Patterns for files to exclude when running rsync up or rsync down
#rsync_exclude:
# - "**/.git"
# - "**/.git/**"
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
#rsync_filter:
# - ".gitignore"
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: [
# This link is very important: it enables docker components running on DALiuGE to
# see the working directory under the same path as the direct components.
sudo ln -Fs /tmp/ray_tmp_mount/default/var/dlg_home /var/dlg_home
]
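# After the command above, each node should end up with a symlink along the lines of
# /var/dlg_home -> /tmp/ray_tmp_mount/default/var/dlg_home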
# List of shell commands to run to set up nodes.
setup_commands:
- sudo apt update
- sudo apt install -y curl
# The next two lines work around a permissions problem with Ray: we need
# access to the docker socket from inside the Ray container as user ray, but
# inside the container there is no docker group; the socket is only visible
# with a numeric group ID from the host.
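# For reference, 'ls -g /var/run/docker.sock' prints something along the lines of
# 'srw-rw---- 1 999 0 ... /var/run/docker.sock', where the third field is the
# (possibly numeric) group that awk extracts below.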
- g=$(ls -g /var/run/docker.sock | awk '{print $3}') && if [[ $g =~ ^[0-9]+$ ]] ; then sudo groupadd -g $g docker; fi;
- sudo usermod -a -G docker ray
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands:
- sudo apt update
- sudo apt install -y curl
- g=$(ls -g /var/run/docker.sock | awk '{print $3}') && if [[ $g =~ ^[0-9]+$ ]] ; then sudo groupadd -g $g docker; fi;
- sudo usermod -a -G docker ray
# In order for the leap_cli docker container to mount the /var/dlg_home directory correctly, the leap_cli
# docker component needs to specify the following in its additional mounts parameter:
#
# /tmp/ray_tmp_mount/var/dlg_home/workspace:/var/dlg_home
#
# and on the Ray hosts the following link needs to exist:
#
# sudo ln -s /tmp/ray_tmp_mount/var/dlg_home /var/dlg_home
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# start the DALiuGE data island manager and a node manager
- dlg dim -d -vvv -H 0.0.0.0 -w /var/dlg_home/workspace
- dlg nm -d -vvv -H 0.0.0.0 --dlg-path=/var/dlg_home/code -w /var/dlg_home/workspace
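# Register this node with the data island manager via its REST API (assumed
# here to listen on port 8001).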
- IP_ADDR="`python -c 'import ray; print(ray._private.services.get_node_ip_address())'`"; curl -X POST http://localhost:8001/api/nodes/$IP_ADDR
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
# start DALiuGE node manager
- dlg nm -vvvd -H 0.0.0.0 --dlg-path=/var/dlg_home/code -w /var/dlg_home/workspace
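# Register this worker with the data island manager on the head node.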
- IP_ADDR="`python -c 'import ray; print(ray._private.services.get_node_ip_address())'`"; curl -X POST http://$RAY_HEAD_IP:8001/api/nodes/$IP_ADDR