forked from scicore-unibas-ch/ansible-role-slurm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathslurm_suspend_openstack.py.j2
101 lines (79 loc) · 3.9 KB
/
slurm_suspend_openstack.py.j2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!{{ slurm_openstack_venv_path }}/bin/python3
# -*- coding: utf-8 -*-
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
""" A Slurm SuspendProgram to delete OpenStack instances.
Usage:
    suspend HOSTLIST_EXPRESSION
where:
    HOSTLIST_EXPRESSION: Name(s) of node(s) to delete, using Slurm's hostlist expression, as per [1].
If a file with the nodename exists in Slurm's StateSaveLocation directory [2] then the OpenStack ID is read from it and used to select the instance to delete.
Otherwise, this will attempt to delete the instance by name, which requires that the name is unique.
OpenStack credentials must be available to this script (e.g. via an application credential in /etc/openstack/clouds.yaml readable by the slurm user).
Output and exceptions are written to the syslog.
[1]: https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendProgram
[2]: https://slurm.schedmd.com/slurm.conf.html#OPT_StateSaveLocation
"""
import sys, os, subprocess, logging, logging.handlers
import openstack
import pprint
# Send this script's log records to the local syslog socket, prefixing each
# message with the script path. The logger itself lets DEBUG and above through;
# as the original author notes, by default only "info" and above categories
# appear in the syslog.
handler = logging.handlers.SysLogHandler(address="/dev/log")
handler.setFormatter(logging.Formatter(f"{sys.argv[0]}: %(message)s"))
logger = logging.getLogger("syslogger")
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
def _read_instance_id(statedir, node):
    """Return the OpenStack instance ID recorded for *node*, or None.

    The ID is the first line of the file named after the node inside
    *statedir*. None is returned when *statedir* is unknown, when the file
    does not exist, or when its first line is blank.
    """
    if not statedir:
        logger.info(f"StateSaveLocation is unknown, skipping instance file lookup for node {node}")
        return None
    instance_file = os.path.join(statedir, node)
    try:
        with open(instance_file) as f:
            return f.readline().strip() or None
    except FileNotFoundError:
        logger.info(f"no instance file found in {statedir} for node {node}")
        return None


def main():
    """Delete the OpenStack instance behind every node Slurm wants suspended.

    Reads the hostlist expression from ``sys.argv[1]``, expands it into node
    names and deletes each node's instance, selecting it by the ID stored in
    the node's instance file when available and by node name otherwise.

    Exits with status 1 when the hostlist argument is missing or when no
    OpenStack connection can be established.
    """
    if len(sys.argv) < 2:
        logger.error("Missing HOSTLIST_EXPRESSION argument")
        sys.exit(1)
    hostlist_expr = sys.argv[1]
    logger.info(f"Slurmctld invoked suspend node {hostlist_expr}")
    remove_nodes = expand_nodes(hostlist_expr)

    try:
        conn = openstack.connection.from_config()
    except Exception as e:
        logger.error(f"Error connecting to OpenStack API: {e}")
        # Without a connection nothing can be deleted; continuing would only
        # fail later with a NameError on the undefined connection.
        sys.exit(1)
    logger.info(f"Got openstack connection {conn}")

    # The state directory is the same for every node, so query it only once
    # instead of spawning scontrol on each loop iteration.
    statedir = get_statesavelocation()

    for node in remove_nodes:
        if not node:
            # Guard against blank names coming out of the hostlist expansion.
            continue
        instance_id = _read_instance_id(statedir, node)
        logger.info(f"Deleting node {node} with ID {instance_id}")
        delete_server(conn, (instance_id or node))
        # updating the node ip using scontrol is not needed when using "SlurmctldParameters=cloud_reg_addrs"
def get_statesavelocation():
    """Return the path configured as Slurm's StateSaveLocation.

    The value is the last whitespace-separated token of the first line of
    ``scontrol show config`` output that starts with ``StateSaveLocation``
    (e.g. ``StateSaveLocation = /var/spool/slurm``). Returns None when no
    such line is present.
    """
    config_dump = subprocess.run(
        ['scontrol', 'show', 'config'],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    ).stdout
    candidates = (
        entry.split()[-1]
        for entry in config_dump.splitlines()
        if entry.startswith('StateSaveLocation')
    )
    return next(candidates, None)
def expand_nodes(hostlist_expr):
    """Expand a Slurm hostlist expression into a list of node names.

    Runs ``scontrol show hostnames HOSTLIST_EXPRESSION`` and returns one list
    entry per non-blank output line. An empty list is returned when scontrol
    prints nothing, so callers never receive a bogus empty node name.
    """
    scontrol = subprocess.run(
        ['scontrol', 'show', 'hostnames', hostlist_expr],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    if scontrol.returncode != 0:
        logger.error(f"scontrol show hostnames {hostlist_expr} exited with status {scontrol.returncode}")
    # splitlines() plus the blank filter turns empty output into [] where
    # strip().split('\n') would have produced [''].
    return [name.strip() for name in scontrol.stdout.splitlines() if name.strip()]
def delete_server(conn, name):
    """Delete the OpenStack server identified by *name*.

    *conn* is an OpenStack connection and *name* is the value handed to
    ``conn.compute.find_server`` (the caller passes either an instance ID or a
    node name). When no matching server is found, a warning is logged and
    nothing is deleted. Any exception raised by the lookup or the deletion is
    logged and swallowed so that the remaining nodes can still be processed.
    """
    try:
        server = conn.compute.find_server(name)
        if server is None:
            # find_server reports an unknown server by returning None; do not
            # forward that to delete_server as if it were a real server.
            logger.warning(f"No OpenStack server found for {name}, nothing to delete")
            return
        conn.compute.delete_server(server)
    except Exception as e:
        logger.error(f"Error deleting server: {e}")
# Script entry point: run the suspend logic only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()