From 759126f05d601a591f9026a13e11a74fde0db716 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Thu, 9 Jan 2025 10:15:46 -0500 Subject: [PATCH 1/4] Adds support for GPU shards detection. Supports heterogeneous node configuration for gpu/shard. --- slurmformspawner/form.py | 49 ++++++++++++++++++++++++++++++--------- slurmformspawner/slurm.py | 2 +- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/slurmformspawner/form.py b/slurmformspawner/form.py index 76e74df..33960f6 100644 --- a/slurmformspawner/form.py +++ b/slurmformspawner/form.py @@ -267,20 +267,47 @@ def config_gpus(self): lock = self.resolve(self.gpus.get('lock')) gpu_choice_map = {} - for gres in choices: - if gres == 'gpu:0': + for choice in choices: + if choice == 'gpu:0': gpu_choice_map['gpu:0'] = 'None' continue - match = re.match(r"(gpu:[\w:.]+)", gres) - if match: - gres = match.group(1).split(':') - number = int(gres[-1]) - if len(gres) == 2: - strings = ('gpu:{}', '{} x GPU') - elif len(gres) > 2: - strings = ('gpu:{}:{{}}'.format(gres[1]), '{{}} x {}'.format(gres[1].upper())) - for i in range(1, number + 1): + + # we now have one choice per type of gres configuration to support + # heterogenous cluster configuration, each node could have multiple types of gres + gres_list = choice.split(',') + + # if the node has shards, we need the number of gpus and number of shards + max_shard_per_gpu = 0 + if any(gres.startswith('shard') for gres in gres_list): + # get total number of gpus and shard + num_gpu = 0 + for gres_def in gres_list: + match = re.match(r"(gpu:[\w:.]+)", gres_def) + if match: + gres = match.group(1).split(':') + num_gpu += int(gres[-1]) + match = re.match(r"(shard:[\w:.]+)", gres_def) + if match: + gres = match.group(1).split(':') + num_shard = int(gres[-1]) + max_shard_per_gpu = max(max_shard_per_gpu, int(num_shard / num_gpu)) + if max_shard_per_gpu > 0: + strings = ('shard:{}', '{} x shard') + for i in range(1, max_shard_per_gpu + 1): 
gpu_choice_map[strings[0].format(i)] = strings[1].format(i) + + for gres_def in gres_list: + match = re.match(r"(gpu:[\w:.]+)", gres_def) + if match: + gres = match.group(1).split(':') + number = int(gres[-1]) + if len(gres) == 2: + strings = ('gpu:{}', '{} x GPU') + elif len(gres) > 2: + strings = ('gpu:{}:{{}}'.format(gres[1]), '{{}} x {}'.format(gres[1].upper())) + for i in range(1, number + 1): + gpu_choice_map[strings[0].format(i)] = strings[1].format(i) + self.form['gpus'].choices = list(gpu_choice_map.items()) if lock: self.form['gpus'].render_kw = {'disabled': 'disabled'} diff --git a/slurmformspawner/slurm.py b/slurmformspawner/slurm.py index ab5e8e7..6399c68 100644 --- a/slurmformspawner/slurm.py +++ b/slurmformspawner/slurm.py @@ -36,7 +36,7 @@ def get_node_info(self): for node in nodes: output['cpu'].append(int(node['CPUTot'])) output['mem'].append(int(node['RealMemory']) - int(node.get('MemSpecLimit', '0'))) - output['gres'].extend(node['Gres'].split(",")) + output['gres'].extend([node['Gres']]) output['partitions'].extend(node['Partitions'].split(",")) return output From be6a039da008ec5f8bdf255851eb196ab892a90f Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Fri, 17 Jan 2025 15:16:56 -0500 Subject: [PATCH 2/4] combine shard and gpu loops, keep shard types and list possible outcomes instead of 'shard' --- slurmformspawner/form.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/slurmformspawner/form.py b/slurmformspawner/form.py index 33960f6..020ce95 100644 --- a/slurmformspawner/form.py +++ b/slurmformspawner/form.py @@ -267,6 +267,9 @@ def config_gpus(self): lock = self.resolve(self.gpus.get('lock')) gpu_choice_map = {} + # if the node has shards, we need the number of gpus and number of shards + max_shard_per_gpu = 0 + gpu_types = [] for choice in choices: if choice == 'gpu:0': gpu_choice_map['gpu:0'] = 'None' @@ -276,37 +279,34 @@ def config_gpus(self): # heterogenous cluster 
configuration, each node could have multiple types of gres gres_list = choice.split(',') - # if the node has shards, we need the number of gpus and number of shards - max_shard_per_gpu = 0 - if any(gres.startswith('shard') for gres in gres_list): - # get total number of gpus and shard - num_gpu = 0 - for gres_def in gres_list: - match = re.match(r"(gpu:[\w:.]+)", gres_def) - if match: - gres = match.group(1).split(':') - num_gpu += int(gres[-1]) - match = re.match(r"(shard:[\w:.]+)", gres_def) - if match: - gres = match.group(1).split(':') - num_shard = int(gres[-1]) - max_shard_per_gpu = max(max_shard_per_gpu, int(num_shard / num_gpu)) - if max_shard_per_gpu > 0: - strings = ('shard:{}', '{} x shard') - for i in range(1, max_shard_per_gpu + 1): - gpu_choice_map[strings[0].format(i)] = strings[1].format(i) - + total_gpu = 0 + num_shard = 0 for gres_def in gres_list: match = re.match(r"(gpu:[\w:.]+)", gres_def) if match: gres = match.group(1).split(':') number = int(gres[-1]) + total_gpu += number if len(gres) == 2: strings = ('gpu:{}', '{} x GPU') + gpu_types += ['GPU'] elif len(gres) > 2: strings = ('gpu:{}:{{}}'.format(gres[1]), '{{}} x {}'.format(gres[1].upper())) + gpu_types += [gres[1].upper()] for i in range(1, number + 1): gpu_choice_map[strings[0].format(i)] = strings[1].format(i) + else: + match = re.match(r"(shard:[\w:.]+)", gres_def) + if match: + gres = match.group(1).split(':') + num_shard = int(gres[-1]) + max_shard_per_gpu = max(max_shard_per_gpu, int(num_shard / total_gpu)) + gpu_types = set(gpu_types) + + if max_shard_per_gpu > 0: + strings = ('shard:{}', '{}/{} x ({})') + for i in range(1, max_shard_per_gpu): + gpu_choice_map[strings[0].format(i)] = strings[1].format(i, max_shard_per_gpu, '|'.join(gpu_types)) self.form['gpus'].choices = list(gpu_choice_map.items()) if lock: From 01f71afd3e0ad1075f42336560ceff87a3d61cbe Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 20 Jan 2025 09:30:35 -0500 Subject: [PATCH 3/4] only add gpu_type 
to the list of sharded gpus if there are shards --- slurmformspawner/form.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/slurmformspawner/form.py b/slurmformspawner/form.py index 020ce95..123c588 100644 --- a/slurmformspawner/form.py +++ b/slurmformspawner/form.py @@ -281,6 +281,7 @@ def config_gpus(self): total_gpu = 0 num_shard = 0 + gpu_type = '' for gres_def in gres_list: match = re.match(r"(gpu:[\w:.]+)", gres_def) if match: @@ -289,10 +290,10 @@ def config_gpus(self): total_gpu += number if len(gres) == 2: strings = ('gpu:{}', '{} x GPU') - gpu_types += ['GPU'] + gpu_type = 'GPU' elif len(gres) > 2: strings = ('gpu:{}:{{}}'.format(gres[1]), '{{}} x {}'.format(gres[1].upper())) - gpu_types += [gres[1].upper()] + gpu_type = gres[1].upper() for i in range(1, number + 1): gpu_choice_map[strings[0].format(i)] = strings[1].format(i) else: @@ -300,6 +301,8 @@ def config_gpus(self): if match: gres = match.group(1).split(':') num_shard = int(gres[-1]) + if num_shard > 0: + gpu_types += [gpu_type] max_shard_per_gpu = max(max_shard_per_gpu, int(num_shard / total_gpu)) gpu_types = set(gpu_types) From e57414213b67ae19c0d8d2dd676cfec709752fb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Mon, 20 Jan 2025 10:56:53 -0500 Subject: [PATCH 4/4] Make gpu_types a set at the beginning --- slurmformspawner/form.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/slurmformspawner/form.py b/slurmformspawner/form.py index 123c588..d53ebf2 100644 --- a/slurmformspawner/form.py +++ b/slurmformspawner/form.py @@ -269,7 +269,7 @@ def config_gpus(self): gpu_choice_map = {} # if the node has shards, we need the number of gpus and number of shards max_shard_per_gpu = 0 - gpu_types = [] + gpu_types = set() for choice in choices: if choice == 'gpu:0': gpu_choice_map['gpu:0'] = 'None' @@ -302,9 +302,8 @@ def config_gpus(self): gres = match.group(1).split(':') num_shard = int(gres[-1]) if num_shard > 0: - gpu_types 
+= [gpu_type] + gpu_types.add(gpu_type) max_shard_per_gpu = max(max_shard_per_gpu, int(num_shard / total_gpu)) - gpu_types = set(gpu_types) if max_shard_per_gpu > 0: strings = ('shard:{}', '{}/{} x ({})')