Commit cdf9073: Add VolumeSize when resuming xspot VM. (#307)
* Add custom compute node prolog and epilog scripts

If the scripts exist, run them to allow modification of the instance at
run time.

Resolves #305

* Add VolumeSize when resuming xspot VM.

Need to specify a default VolumeSize and check that it is at least as large
as the root volume of the AMI for the image used by the VM.

Resolves #306
cartalla authored Feb 10, 2025
1 parent cc199b5 commit cdf9073

Showing 4 changed files with 75 additions and 30 deletions.
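The change introduces a three-level precedence for a pool's root-volume size: an explicit pool VolumeSize wins, then the VolumeSize of the pool's image, then the new DefaultVolumeSize. A minimal sketch of the relevant slurm/Xio configuration fragment as a Python dict; the keys match config_schema.py below, but all names, the AMI ID, and the sizes are illustrative:

```python
# Illustrative slurm/Xio fragment; names, AMI ID, and sizes are examples only.
xio_config = {
    'PartitionName': 'xio',
    'DefaultImageName': 'base-image',
    'DefaultVolumeSize': 10,  # GB; the schema default is also 10
    'Images': [
        # VolumeSize is optional and is raised to the AMI's root-volume size
        # if it is smaller (see the cdk_slurm_stack.py change below).
        {'ImageName': 'base-image', 'ImageId': 'ami-0123456789abcdef0', 'VolumeSize': 20},
    ],
    'Pools': [
        # A pool without VolumeSize inherits the image's VolumeSize, else
        # DefaultVolumeSize; a pool value smaller than the image's is an error.
        {'PoolName': 'amd-8-16', 'ProfileName': 'amd', 'CPUs': 8, 'MaxMemory': 16384},
    ],
}
```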
73 changes: 50 additions & 23 deletions source/cdk/cdk_slurm_stack.py
@@ -829,7 +829,8 @@ def update_config_for_exostellar(self):
         Configure /home file system.
         '''
         logger.info(f"Updating configuration for Exostellar")
-        ems_stack_name = self.config['slurm']['Xio']['ManagementServerStackName']
+        xio_config = self.config['slurm']['Xio']
+        ems_stack_name = xio_config['ManagementServerStackName']
         logger.info(f" stack: {ems_stack_name}")
 
         # Get RES environment name from stack parameters.
@@ -897,26 +898,26 @@ def update_config_for_exostellar(self):
         if not exostellar_security_group:
             logger.error(f"ExostellarSecurityGroup resource not found in {ems_stack_name} EMS stack")
             exit(1)
-        if 'Controllers' not in self.config['slurm']['Xio']:
-            self.config['slurm']['Xio']['Controllers'] = {}
-        if 'SecurityGroupIds' not in self.config['slurm']['Xio']['Controllers']:
-            self.config['slurm']['Xio']['Controllers']['SecurityGroupIds'] = []
-        if 'Workers' not in self.config['slurm']['Xio']:
-            self.config['slurm']['Xio']['Workers'] = {}
-        if 'SecurityGroupIds' not in self.config['slurm']['Xio']['Workers']:
-            self.config['slurm']['Xio']['Workers']['SecurityGroupIds'] = []
-        if exostellar_security_group not in self.config['slurm']['Xio']['Controllers']['SecurityGroupIds']:
-            self.config['slurm']['Xio']['Controllers']['SecurityGroupIds'].append(exostellar_security_group)
-        if exostellar_security_group not in self.config['slurm']['Xio']['Workers']['SecurityGroupIds']:
-            self.config['slurm']['Xio']['Workers']['SecurityGroupIds'].append(exostellar_security_group)
+        if 'Controllers' not in xio_config:
+            xio_config['Controllers'] = {}
+        if 'SecurityGroupIds' not in xio_config['Controllers']:
+            xio_config['Controllers']['SecurityGroupIds'] = []
+        if 'Workers' not in xio_config:
+            xio_config['Workers'] = {}
+        if 'SecurityGroupIds' not in xio_config['Workers']:
+            xio_config['Workers']['SecurityGroupIds'] = []
+        if exostellar_security_group not in xio_config['Controllers']['SecurityGroupIds']:
+            xio_config['Controllers']['SecurityGroupIds'].append(exostellar_security_group)
+        if exostellar_security_group not in xio_config['Workers']['SecurityGroupIds']:
+            xio_config['Workers']['SecurityGroupIds'].append(exostellar_security_group)
         if 'AdditionalSecurityGroupsStackName' in self.config:
             if self.slurm_compute_node_sg_id:
-                if self.slurm_compute_node_sg_id not in self.config['slurm']['Xio']['Workers']['SecurityGroupIds']:
-                    self.config['slurm']['Xio']['Workers']['SecurityGroupIds'].append(self.slurm_compute_node_sg_id)
+                if self.slurm_compute_node_sg_id not in xio_config['Workers']['SecurityGroupIds']:
+                    xio_config['Workers']['SecurityGroupIds'].append(self.slurm_compute_node_sg_id)
         if 'RESStackName' in self.config:
             if self.res_dcv_security_group_id:
-                if self.res_dcv_security_group_id not in self.config['slurm']['Xio']['Workers']['SecurityGroupIds']:
-                    self.config['slurm']['Xio']['Workers']['SecurityGroupIds'].append(self.res_dcv_security_group_id)
+                if self.res_dcv_security_group_id not in xio_config['Workers']['SecurityGroupIds']:
+                    xio_config['Workers']['SecurityGroupIds'].append(self.res_dcv_security_group_id)
 
         # Get values from stack outputs
         ems_ip_address = None
@@ -927,10 +928,31 @@ def update_config_for_exostellar(self):
         if not ems_ip_address:
             logger.error(f"ExostellarMgmtServerPrivateIP output not found in {ems_stack_name} EMS stack.")
             exit(1)
-        self.config['slurm']['Xio']['ManagementServerIp'] = ems_ip_address
+        xio_config['ManagementServerIp'] = ems_ip_address
 
+        # Get VolumeSize for AMIs used by Images
+        image_configs = {}
+        for image_config in xio_config['Images']:
+            image_id = image_config['ImageId']
+            image_name = image_config['ImageName']
+            volume_size = image_config.get('VolumeSize', xio_config['DefaultVolumeSize'])
+            image_configs[image_name] = image_config
+            images_info = self.ec2_client.describe_images(ImageIds=[image_id])['Images']
+            if not images_info:
+                logger.error(f"slurm/Xio/Images error. ImageId {image_id} doesn't exist.")
+                exit(1)
+            ami_info = images_info[0]
+            if len(ami_info['BlockDeviceMappings']) != 1:
+                logger.error(f"Images for XIO must have exactly 1 EBS volume. {image_id} has {len(ami_info['BlockDeviceMappings'])}")
+                exit(1)
+            min_volume_size = ami_info['BlockDeviceMappings'][0]['Ebs']['VolumeSize']
+            if volume_size < min_volume_size:
+                logger.info(f"Increased {image_name} VolumeSize from {volume_size} to {min_volume_size} to match {image_id}.")
+                volume_size = min_volume_size
+            image_config['VolumeSize'] = volume_size
+
         # Check that all of the profiles used by the pools are defined
-        logger.debug(f"Xio config:\n{json.dumps(self.config['slurm']['Xio'], indent=4)}")
+        logger.debug(f"Xio config:\n{json.dumps(xio_config, indent=4)}")
         WEIGHT_PER_CORE = {
             'amd': 45,
             'intel': 78
@@ -944,7 +966,7 @@ def update_config_for_exostellar(self):
         xio_profile_configs = {}
         self.instance_type_info = self.plugin.get_instance_types_info(self.cluster_region)
         self.instance_family_info = self.plugin.get_instance_families_info(self.cluster_region)
-        for profile_config in self.config['slurm']['Xio']['Profiles']:
+        for profile_config in xio_config['Profiles']:
             profile_name = profile_config['ProfileName']
             # Check that profile name is alphanumeric
             if not re.compile('^[a-zA-Z0-9]+$').fullmatch(profile_name):
@@ -994,7 +1016,7 @@ def update_config_for_exostellar(self):
                     profile_config['SpotFleetTypes'].remove(invalid_instance_type)
 
         xio_pool_names = {}
-        for pool_config in self.config['slurm']['Xio']['Pools']:
+        for pool_config in xio_config['Pools']:
             pool_name = pool_config['PoolName']
             if pool_name in xio_pool_names:
                 logger.error(f"{pool_name} Xio pool already defined")
@@ -1006,11 +1028,11 @@ def update_config_for_exostellar(self):
                 number_of_errors += 1
                 continue
             if 'ImageName' not in pool_config:
-                if 'DefaultImageName' not in self.config['slurm']['Xio']:
+                if 'DefaultImageName' not in xio_config:
                     logger.error(f"Xio pool {pool_name} didn't specify ImageName and Xio DefaultImageName not set.")
                     number_of_errors += 1
                 else:
-                    pool_config['ImageName'] = self.config['slurm']['Xio']['DefaultImageName']
+                    pool_config['ImageName'] = xio_config['DefaultImageName']
             if 'InstanceMemory' not in pool_config and 'MaxMemory' not in pool_config:
                 logger.error(f"Must specify either InstanceMemory or MaxMemory in {pool_name} config.")
                 number_of_errors += 1
@@ -1026,6 +1048,11 @@ def update_config_for_exostellar(self):
             profile_config = xio_profile_configs[profile_name]
             cpu_vendor = profile_config['CpuVendor']
             pool_config['Weight'] = pool_config['CPUs'] * WEIGHT_PER_CORE[cpu_vendor] + int(pool_config.get('InstanceMemory', pool_config.get('MaxMemory'))/1024 * WEIGHT_PER_GB[cpu_vendor])
+            # Set/validate pool's VolumeSize
+            image_config = image_configs.get(pool_config['ImageName'], {})
+            pool_config['VolumeSize'] = pool_config.get('VolumeSize', image_config.get('VolumeSize', xio_config['DefaultVolumeSize']))
+            if pool_config['VolumeSize'] < image_config.get('VolumeSize', 0):
+                logger.error(f"Pool {pool_config['PoolName']} VolumeSize must be >= VolumeSize for image {pool_config['ImageName']}={image_config['VolumeSize']}")
 
         if number_of_errors:
             exit(1)
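The core of the new AMI check is a single describe_images call. Below is a standalone sketch of the same logic, assuming boto3 credentials and illustrative IDs; min_root_volume_size is a hypothetical helper written for this note, not the project's code:

```python
import boto3

def min_root_volume_size(ec2_client, image_id: str) -> int:
    """Return the root EBS volume size (GB) of an AMI that has exactly one volume."""
    images = ec2_client.describe_images(ImageIds=[image_id])['Images']
    if not images:
        raise ValueError(f"ImageId {image_id} doesn't exist")
    mappings = images[0]['BlockDeviceMappings']
    if len(mappings) != 1:
        raise ValueError(f"{image_id} has {len(mappings)} volumes, expected exactly 1")
    return mappings[0]['Ebs']['VolumeSize']

# Clamp a configured size up to the AMI's minimum, as the commit does.
ec2 = boto3.client('ec2', region_name='us-east-1')  # example region
requested = 10                                      # e.g. DefaultVolumeSize
volume_size = max(requested, min_root_volume_size(ec2, 'ami-0123456789abcdef0'))
```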
6 changes: 4 additions & 2 deletions source/cdk/config_schema.py
@@ -1716,11 +1716,13 @@ def get_config_schema(config):
                 'PartitionName': str,
                 Optional('Images'): [
                     {
+                        'ImageName': str,
                         'ImageId': str,
-                        'ImageName': str
+                        Optional('VolumeSize'): int
                     }
                 ],
                 Optional('DefaultImageName'): str,
+                Optional('DefaultVolumeSize', default=10): int,
                 Optional('Profiles', default=default_xio_profiles): [
                     {
                         'ProfileName': str,
@@ -1745,7 +1747,7 @@ def get_config_schema(config):
                         Optional('ImageName'): str,
                         Optional('InstanceMemory'): int,
                         Optional('MaxMemory'): int,
-                        Optional('VolumeSize', default=10): int,
+                        Optional('VolumeSize'): int,
                         Optional('Weight'): int
                     }
                 ],
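The schema change also moves the default up a level: a pool-level default=10 would have silently overridden the image-level fallback, so only DefaultVolumeSize keeps a default and a pool's VolumeSize stays absent unless explicitly set. A small sketch of how the schema package handles this, assuming a fragment shaped like the one above:

```python
from schema import Optional, Schema

xio_schema = Schema({
    Optional('DefaultVolumeSize', default=10): int,
    Optional('Pools', default=[]): [{
        'PoolName': str,
        Optional('VolumeSize'): int,  # no default: absence means "inherit"
    }],
})

validated = xio_schema.validate({'Pools': [{'PoolName': 'amd-8-16'}]})
assert validated['DefaultVolumeSize'] == 10       # filled in by the schema
assert 'VolumeSize' not in validated['Pools'][0]  # resolved later in the CDK code
```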
8 additions & 0 deletions (file name not shown in this capture)
@@ -35,6 +35,10 @@ trap on_exit EXIT
 config_dir=/opt/slurm/config
 config_bin_dir=$config_dir/bin
 
+if [ -e $config_bin_dir/on_compute_node_configured_custom_prolog.sh ]; then
+    $config_bin_dir/on_compute_node_configured_custom_prolog.sh
+fi
+
 if ! [ -z $HomeMountSrc ]; then
     umount /home
     mount $HomeMountSrc /home
@@ -82,6 +86,10 @@ fi
 #     -e @$ANSIBLE_PATH/ansible_compute_node_vars.yml &
 # popd
 
+if [ -e $config_bin_dir/on_compute_node_configured_custom_epilog.sh ]; then
+    $config_bin_dir/on_compute_node_configured_custom_epilog.sh
+fi
+
 echo "$(date): Finished ${script_name}"
 
 exit 0
13 additions & 5 deletions (file name not shown in this capture)
@@ -39,34 +39,41 @@ function resume_xspot()
     image_name=''
     cpus=''
     mem=''
+    vol_size=''
     {% for pool_config in xio_config.Pools %}
     if [[ $pool_name == '{{ pool_config.PoolName }}' ]]; then
         profile_name='{{ pool_config.ProfileName }}'
         image_name='{{ pool_config.ImageName }}'
         cpus={{ pool_config.CPUs }}
         mem={{ pool_config.MaxMemory }}
+        vol_size={{ pool_config.VolumeSize }}
     fi
     {% endfor %}
     if [[ -z $profile_name ]]; then
-        echo "error: No profile_name for $host
+        echo "error: No profile_name for $host"
         return 1
     fi
     if [[ -z $image_name ]]; then
-        echo "error: No image_name for $host
+        echo "error: No image_name for $host"
         return 1
     fi
     if [[ -z $cpus ]]; then
-        echo "error: No cpus for $host
+        echo "error: No cpus for $host"
         return 1
     fi
     if [[ -z $mem ]]; then
-        echo "error: No mem for $host
+        echo "error: No mem for $host"
         return 1
     fi
+    if [[ -z $vol_size ]]; then
+        echo "error: No vol_size for $host"
+        return 1
+    fi
     echo "ProfileName=$profile_name"
     echo "ImageName=$image_name"
     echo "CPUs=$cpus"
     echo "MaxMemory=$mem"
+    echo "VolumeSize=$vol_size"
 
     TMP_USER_DATA_FILE=$(mktemp).sh
     cp ${SLURM_CONF_PATH}/exostellar/xspot-vm_user_data.sh $TMP_USER_DATA_FILE
@@ -84,7 +91,8 @@ function resume_xspot()
         "CPUs": $cpus,
         "ImageName": "$image_name",
         "MaxMemory": $mem,
-        "UserData": "$user_data"
+        "UserData": "$user_data",
+        "VolumeSize": $vol_size
     }
 }
END
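With that, the heredoc above now carries the resolved volume size to the Exostellar management server. The inner VM-spec object it renders, shown with illustrative values (field names are taken from the diff; the surrounding request and endpoint are not shown here):

```python
import json

vm_spec = {
    "CPUs": 8,                                          # $cpus
    "ImageName": "base-image",                          # $image_name (illustrative)
    "MaxMemory": 16384,                                 # $mem, in MB
    "UserData": "<contents of xspot-vm_user_data.sh>",  # $user_data
    "VolumeSize": 20,                                   # $vol_size, new in this commit
}
print(json.dumps(vm_spec, indent=4))
```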
