
Commit

Merge pull request #807 from Joon-Klaps/kul_gpu
Extend vsc_kul_uhasselt to support gpu partitions
Joon-Klaps authored Dec 5, 2024
2 parents 304c9e4 + c4a9594 commit 3518758
Showing 2 changed files with 129 additions and 113 deletions.
231 changes: 122 additions & 109 deletions conf/vsc_kul_uhasselt.config
@@ -1,52 +1,40 @@
// Default to /tmp directory if $VSC_SCRATCH scratch env is not available,
// see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config
def scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"

// Specify the work directory
workDir = "$scratch_dir/work"
scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"
tier1_project = System.getenv("SLURM_ACCOUNT") ?: null

// Perform work directory cleanup when the run has successfully completed
// cleanup = true

// Get the hostname and check some values for tier1
def hostname = "genius"
try {
hostname = ['/bin/bash', '-c', 'sinfo --clusters=genius,wice -s | head -n 1'].execute().text.replace('CLUSTER: ','')
} catch (java.io.IOException e) {
System.err.println("WARNING: Could not run sinfo to determine current cluster, defaulting to genius")
}

def tier1_project = System.getenv("SLURM_ACCOUNT") ?: null

if (! tier1_project && (hostname.contains("genius") || hostname.contains("wice"))) {
// Hard-code that Tier 1 cluster dodrio requires a project account
System.err.println("Please specify your VSC project account with environment variable SLURM_ACCOUNT.")
System.exit(1)
}


// Reduce the job submit rate to about 50 per minute, this way the server won't be bombarded with jobs
// Limit queueSize to keep job rate under control and avoid timeouts
executor {
submitRateLimit = '50/1min'
queueSize = 30
queueSize = 50
exitReadTimeout = "10min"
}

// Add a backoff strategy to catch cluster timeouts, and properly symlink files in scratch to the work directory
process {
stageInMode = "symlink"
stageOutMode = "rsync"
errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' }
maxRetries = 5
// array = 50
executor = 'slurm'
stageInMode = "symlink"
stageOutMode = "rsync"
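// Exponential backoff on retry: sleep 2^attempt * 200 ms (400 ms, 800 ms, 1600 ms, ...) before resubmitting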
errorStrategy = { sleep(Math.pow(2, task.attempt ?: 1) * 200 as long); return 'retry' }
maxRetries = 3
array = 30
}

// Specify that singularity should be used and where the cache dir will be for the images
singularity {
enabled = true
autoMounts = true
cacheDir = "$scratch_dir/.singularity"
enabled = true
autoMounts = true
cacheDir = "$scratch_dir/.singularity"
pullTimeout = "30 min"
}

params {
config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
}

env {
@@ -56,112 +44,137 @@ env {

// AWS maximum retries for errors (This way the pipeline doesn't fail if the download fails one time)
aws {
maxErrorRetry = 3
maxErrorRetry = 3
}

// Define profiles for each cluster
profiles {
genius {
params {
config_profile_description = 'HPC_GENIUS profile for use on the genius cluster of the VSC HPC.'
config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
max_memory = 703.GB // 768 - 65 so 65GB for overhead, max is 720000MB
max_time = 168.h
max_cpus = 36
}
params.config_profile_description = 'genius profile for use on the genius cluster of the VSC HPC.'

process {
resourceLimits = [
memory: 703.GB,
cpus: 136,
time: 168.h
]
executor = 'slurm'
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
beforeScript = 'module load cluster/genius'
clusterOptions = { "--clusters=genius --account=$tier1_project" }

queue = {
switch (task.memory) {
case { it >= 175.GB }: // max is 180000
switch (task.time) {
case { it >= 72.h }:
return 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long'
default:
return 'bigmem'
}
default:
switch (task.time) {
case { it >= 72.h }:
return 'batch_long'
default:
return 'batch'
}
task.memory >= 175.GB ?
(task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem') :
(task.time >= 72.h ? 'batch_long' : 'batch')
}

withLabel: '.*gpu.*'{
resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
clusterOptions = {
// suggested to use 9 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
}

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
}
}
clusterOptions = { "--clusters=genius --account=$tier1_project" }
scratch = "$scratch_dir"
}
}

wice {

params {
config_profile_description = 'HPC_WICE profile for use on the Wice cluster of the VSC HPC.'
config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
max_memory = 1968.GB // max is 2016000
max_cpus = 72
max_time = 168.h
genius_gpu {
params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.'
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h]
beforeScript = 'module load cluster/genius'
clusterOptions = {
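// suggested to use 9 cpus per gpu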
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
}

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
}
}
}

wice {
params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.'

process {
resourceLimits = [
memory: 1968.GB,
cpus: 72,
time: 168.h
]
executor = 'slurm'
// max is 2016000
resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ]
clusterOptions = { "--clusters=wice --account=$tier1_project"}
beforeScript = 'module load cluster/wice'

queue = {
switch (task.memory) {
case { it >= 239.GB }: // max is 244800
switch (task.time) {
case { it >= 72.h }:
return 'dedicated_big_bigmem'
default:
return 'bigmem,hugemem'
}
default:
switch (task.time) {
case { it >= 72.h }:
return 'batch_long,batch_icelake_long,batch_sapphirerapids_long'
default:
return 'batch,batch_sapphirerapids,batch_icelake'
}
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
(task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake')
}

withLabel: '.*gpu.*'{
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
clusterOptions = {
// suggested to use 16 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
"--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
}

queue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
}
}
clusterOptions = { "--clusters=wice --account=$tier1_project"}
scratch = "$scratch_dir"
}
}

superdome {
params {
config_profile_description = 'HPC_SUPERDOME profile for use on the genius cluster of the VSC HPC.'
config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
max_memory = 5772.GB // 6000 - 228 so 228GB for overhead, max is 5910888MB
max_cpus = 14
max_time = 168.h

wice_gpu {
params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.'
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
beforeScript = 'module load cluster/wice'
clusterOptions = {
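// suggested to use 16 cpus per gpu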
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
"--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
}

queue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
}
}
}

superdome {
params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.'

process {
resourceLimits = [
memory: 5772.GB,
cpus: 14,
time: 168.h
]
executor = 'slurm'
queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' }
clusterOptions = {"--clusters=genius --account=$tier1_project"}
scratch = "$scratch_dir"
beforeScript = 'module load cluster/genius/superdome'
// 6000 - 228 so 228GB for overhead, max is 5910888MB
resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h]

queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' }
}
}
}


11 changes: 7 additions & 4 deletions docs/vsc_kul_uhasselt.md
@@ -28,14 +28,14 @@ export NXF_CONDA_CACHEDIR="$VSC_SCRATCH/miniconda3/envs"

# Optional tower key
# export TOWER_ACCESS_TOKEN="<your_tower_access_token>"
# export NXF_VER="<version>" # make sure it's larger then 24.04.0
# export NXF_VER="<version>" # make sure it's 24.10.1 or later
```

:::warning
The current config is setup with array jobs. Make sure nextflow version >= 24.04.0, read [array jobs in nextflow](https://www.nextflow.io/docs/latest/process.html#array) you can do this in
The current config is set up with array jobs, so make sure your Nextflow version is >= 24.10.1 (see [array jobs in nextflow](https://www.nextflow.io/docs/latest/process.html#array)). You can set this with

```bash
export NXF_VER=24.04.0
export NXF_VER=24.10.1
```

:::
@@ -64,10 +64,13 @@ nextflow run <pipeline> -profile vsc_kul_uhasselt,<CLUSTER> <Add your other parameters>
Here the cluster options are:

- genius
- genius_gpu
- wice
- wice_gpu
- superdome
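
For example, a run using the `wice` profile could look like this (the pipeline name, input and output paths below are placeholders):

```bash
# Hypothetical example: run an nf-core pipeline on the wice cluster partitions
nextflow run nf-core/rnaseq \
    -profile vsc_kul_uhasselt,wice \
    --input samplesheet.csv \
    --outdir "$VSC_SCRATCH/rnaseq_results"
```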

> **NB:** The vsc_kul_uhasselt profile is based on a selected amount of SLURM partitions. Should you require resources outside of these limits (e.g.gpus) you will need to provide a custom config specifying an appropriate SLURM partition (e.g. 'gpu\*').
> **NB:** The vsc_kul_uhasselt profile is based on a selected set of SLURM partitions. The profile will select, to the best of its ability, the most appropriate partition for each job. Modules with a label containing `gpu` will be allocated to a GPU partition even when the 'normal' `genius` profile is selected. Select the `genius_gpu` or `wice_gpu` profile to force jobs onto a GPU partition.
> **NB:** If a module does not have `accelerator` set, the number of GPUs is derived from the requested resources (roughly one GPU per 9 CPUs on genius and per 16 CPUs on wice).
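
As a sketch, a small custom config (passed with `-c`) could pin the GPU count for a specific process instead of letting the profile derive it from the CPU request; the process name and GPU count below are hypothetical:

```groovy
// Hypothetical custom config: request exactly 2 GPUs for one process
// rather than deriving the count from task.cpus.
process {
    withName: 'PARABRICKS_FQ2BAM' {
        accelerator = 2
    }
}
```
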
Use the `--cluster` option to specify the cluster you intend to use when submitting the job.

