Merge pull request #807 from Joon-Klaps/kul_gpu
Extend vsc_kul_uhasselt to support gpu partitions
Showing 2 changed files with 129 additions and 113 deletions.
@@ -1,52 +1,40 @@
 // Default to /tmp directory if $VSC_SCRATCH scratch env is not available,
 // see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config
-def scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"
-
-// Specify the work directory
-workDir = "$scratch_dir/work"
+scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"
+tier1_project = System.getenv("SLURM_ACCOUNT") ?: null
 
 // Perform work directory cleanup when the run has succesfully completed
 // cleanup = true
 
-// Get the hostname and check some values for tier1
-def hostname = "genius"
-try {
-    hostname = ['/bin/bash', '-c', 'sinfo --clusters=genius,wice -s | head -n 1'].execute().text.replace('CLUSTER: ','')
-} catch (java.io.IOException e) {
-    System.err.println("WARNING: Could not run sinfo to determine current cluster, defaulting to genius")
-}
-
-def tier1_project = System.getenv("SLURM_ACCOUNT") ?: null
-
-if (! tier1_project && (hostname.contains("genius") || hostname.contains("wice"))) {
-    // Hard-code that Tier 1 cluster dodrio requires a project account
-    System.err.println("Please specify your VSC project account with environment variable SLURM_ACCOUNT.")
-    System.exit(1)
-}
-
-// Reduce the job submit rate to about 50 per minute, this way the server won't be bombarded with jobs
+// Limit queueSize to keep job rate under control and avoid timeouts
 executor {
     submitRateLimit = '50/1min'
-    queueSize = 30
+    queueSize = 50
+    exitReadTimeout = "10min"
 }
 
 // Add backoff strategy to catch cluster timeouts and proper symlinks of files in scratch to the work directory
 process {
+    executor = 'slurm'
     stageInMode = "symlink"
     stageOutMode = "rsync"
-    errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' }
-    maxRetries = 5
-    // array = 50
+    errorStrategy = { sleep(Math.pow(2, task.attempt ?: 1) * 200 as long); return 'retry' }
+    maxRetries = 3
+    array = 30
 }
 
 // Specify that singularity should be used and where the cache dir will be for the images
 singularity {
     enabled = true
     autoMounts = true
     cacheDir = "$scratch_dir/.singularity"
+    pullTimeout = "30 min"
 }
 
+params {
+    config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
+    config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
+}
+
 env {
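The retry strategy above backs off exponentially: each attempt sleeps 2^attempt * 200 ms before resubmitting. A minimal Groovy sketch of the delays it produces (the println loop is illustrative, not part of the config):

    // Delays from the errorStrategy closure, with task.attempt starting at 1
    // and maxRetries = 3: 400 ms, 800 ms, 1600 ms
    (1..3).each { attempt ->
        long delayMs = (Math.pow(2, attempt) * 200) as long
        println "retry ${attempt}: sleep ${delayMs} ms"
    }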
@@ -56,112 +44,137 @@ env {
 
 // AWS maximum retries for errors (This way the pipeline doesn't fail if the download fails one time)
 aws {
     maxErrorRetry = 3
 }
 
 // Define profiles for each cluster
 profiles {
     genius {
-        params {
-            config_profile_description = 'HPC_GENIUS profile for use on the genius cluster of the VSC HPC.'
-            config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
-            config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
-            max_memory = 703.GB // 768 - 65 so 65GB for overhead, max is 720000MB
-            max_time = 168.h
-            max_cpus = 36
-        }
+        params.config_profile_description = 'genius profile for use on the genius cluster of the VSC HPC.'
 
         process {
-            resourceLimits = [
-                memory: 703.GB,
-                cpus: 136,
-                time: 168.h
-            ]
-            executor = 'slurm'
+            // 768 - 65 so 65GB for overhead, max is 720000MB
+            resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
+            beforeScript = 'module load cluster/genius'
+            clusterOptions = { "--clusters=genius --account=$tier1_project" }
 
             queue = {
-                switch (task.memory) {
-                    case { it >= 175.GB }: // max is 180000
-                        switch (task.time) {
-                            case { it >= 72.h }:
-                                return 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long'
-                            default:
-                                return 'bigmem'
-                        }
-                    default:
-                        switch (task.time) {
-                            case { it >= 72.h }:
-                                return 'batch_long'
-                            default:
-                                return 'batch'
-                        }
-                }
+                task.memory >= 175.GB ?
+                    (task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem') :
+                    (task.time >= 72.h ? 'batch_long' : 'batch')
             }
 
+            withLabel: '.*gpu.*' {
+                resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
+                apptainer.runOptions = '--containall --cleanenv --nv'
+                singularity.runOptions = '--containall --cleanenv --nv'
+                clusterOptions = {
+                    // suggested to use 9 cpus per gpu
+                    def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?: 1) / 9) as int)
+                    "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
+                }
+
+                queue = {
+                    task.memory >= 175.GB ?
+                        (task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
+                        (task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
+                }
+            }
-            clusterOptions = { "--clusters=genius --account=$tier1_project" }
-            scratch = "$scratch_dir"
         }
     }
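The clusterOptions closure in the GPU label derives the GPU reservation: an explicit accelerator request wins, otherwise it falls back to one GPU per 9 CPUs on genius (16 on wice), never less than one. A standalone Groovy sketch of that heuristic (the gpusFor helper name is ours, for illustration):

    // Mirror of the fallback in clusterOptions: requested ?: max(1, floor(cpus / 9))
    def gpusFor = { Integer requested, Integer cpus ->
        requested ?: Math.max(1, Math.floor((cpus ?: 1) / 9) as int)
    }
    assert gpusFor(null, 4)  == 1   // small jobs still reserve one GPU
    assert gpusFor(null, 18) == 2   // 18 CPUs -> 2 GPUs
    assert gpusFor(3, 36)    == 3   // an explicit task.accelerator.request wins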
-    wice {
-
-        params {
-            config_profile_description = 'HPC_WICE profile for use on the Wice cluster of the VSC HPC.'
-            config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
-            config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
-            max_memory = 1968.GB // max is 2016000
-            max_cpus = 72
-            max_time = 168.h
-        }
+    genius_gpu {
+        params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.'
+        apptainer.runOptions = '--containall --cleanenv --nv'
+        singularity.runOptions = '--containall --cleanenv --nv'
+
+        process {
+            // 768 - 65 so 65GB for overhead, max is 720000MB
+            resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
+            beforeScript = 'module load cluster/genius'
+            clusterOptions = {
+                def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?: 1) / 9) as int)
+                "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
+            }
+
+            queue = {
+                task.memory >= 175.GB ?
+                    (task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
+                    (task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
+            }
+        }
+    }
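On the pipeline side, a process opts into these settings through its label; the withLabel: '.*gpu.*' selector matches any label containing "gpu". A hypothetical process that would pick up the block above (the process name, label, and script are placeholders, not part of this PR):

    process ALIGN_ON_GPU {
        label 'with_gpu'   // matched by the '.*gpu.*' selector
        accelerator 2      // surfaces as task.accelerator.request in clusterOptions
        cpus 18            // without accelerator, the 9-CPUs-per-GPU fallback gives 2

        script:
        """
        nvidia-smi
        """
    }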
+    wice {
+        params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.'
 
         process {
-            resourceLimits = [
-                memory: 1968.GB,
-                cpus: 72,
-                time: 168.h
-            ]
-            executor = 'slurm'
+            // max is 2016000
+            resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ]
+            clusterOptions = { "--clusters=wice --account=$tier1_project" }
+            beforeScript = 'module load cluster/wice'
 
             queue = {
-                switch (task.memory) {
-                    case { it >= 239.GB }: // max is 244800
-                        switch (task.time) {
-                            case { it >= 72.h }:
-                                return 'dedicated_big_bigmem'
-                            default:
-                                return 'bigmem,hugemem'
-                        }
-                    default:
-                        switch (task.time) {
-                            case { it >= 72.h }:
-                                return 'batch_long,batch_icelake_long,batch_sapphirerapids_long'
-                            default:
-                                return 'batch,batch_sapphirerapids,batch_icelake'
-                        }
-                }
+                task.memory >= 239.GB ?
+                    (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
+                    (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake')
             }
 
+            withLabel: '.*gpu.*' {
+                resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
+                apptainer.runOptions = '--containall --cleanenv --nv'
+                singularity.runOptions = '--containall --cleanenv --nv'
+                clusterOptions = {
+                    // suggested to use 16 cpus per gpu
+                    def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?: 1) / 16) as int)
+                    "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
+                }
+
+                queue = {
+                    task.memory >= 239.GB ?
+                        (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
+                        (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
+                }
+            }
-            clusterOptions = { "--clusters=wice --account=$tier1_project" }
-            scratch = "$scratch_dir"
         }
     }
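The queue closures replace the old nested switch statements with ternaries keyed on task memory and walltime. A small Groovy sketch of the wice CPU-queue decision (bare numbers stand in for task.memory in GB and task.time in hours):

    // >= 239 GB selects the bigmem queues; >= 72 h selects the *_long variants
    def wiceQueue = { int memGB, int hours ->
        memGB >= 239 ?
            (hours >= 72 ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
            (hours >= 72 ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long'
                         : 'batch,batch_sapphirerapids,batch_icelake')
    }
    assert wiceQueue(64, 8)    == 'batch,batch_sapphirerapids,batch_icelake'
    assert wiceQueue(300, 100) == 'dedicated_big_bigmem'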
-    superdome {
-        params {
-            config_profile_description = 'HPC_SUPERDOME profile for use on the genius cluster of the VSC HPC.'
-            config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
-            config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
-            max_memory = 5772.GB // 6000 - 228 so 228GB for overhead, max is 5910888MB
-            max_cpus = 14
-            max_time = 168.h
-        }
+    wice_gpu {
+        params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.'
+        apptainer.runOptions = '--containall --cleanenv --nv'
+        singularity.runOptions = '--containall --cleanenv --nv'
+
+        process {
+            // 768 - 65 so 65GB for overhead, max is 720000MB
+            resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
+            beforeScript = 'module load cluster/wice'
+            clusterOptions = {
+                def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?: 1) / 16) as int)
+                "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
+            }
+
+            queue = {
+                task.memory >= 239.GB ?
+                    (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
+                    (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
+            }
+        }
+    }
+    superdome {
+        params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.'
 
         process {
-            resourceLimits = [
-                memory: 5772.GB,
-                cpus: 14,
-                time: 168.h
-            ]
-            executor = 'slurm'
-            queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' }
             clusterOptions = { "--clusters=genius --account=$tier1_project" }
-            scratch = "$scratch_dir"
             beforeScript = 'module load cluster/genius/superdome'
+            // 6000 - 228 so 228GB for overhead, max is 5910888MB
+            resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h ]
 
+            queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' }
         }
     }
 }
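With these profiles in place, GPU jobs would be routed by adding the matching profile to the run in the same way the existing cluster profiles are selected, e.g. -profile vsc_kul_uhasselt,genius_gpu, with SLURM_ACCOUNT exported so that tier1_project resolves. The exact invocation depends on the pipeline; see the nf-core/configs documentation for this config.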